diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index c3cf714ce6c1..2dc562932845 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -19,6 +19,7 @@
 PROJECT_DEPENDENCIES = {
     "llvm": set(),
     "clang": {"llvm"},
+    "CIR": {"clang", "mlir"},
     "bolt": {"clang", "lld", "llvm"},
     "clang-tools-extra": {"clang", "llvm"},
     "compiler-rt": {"clang", "lld"},
@@ -55,6 +56,7 @@
     ".ci": {
         "llvm",
         "clang",
+        "CIR",
         "lld",
         "lldb",
         "bolt",
@@ -128,6 +130,7 @@
     "lldb": "check-lldb",
     "llvm": "check-llvm",
     "clang": "check-clang",
+    "CIR": "check-clang-cir",
     "bolt": "check-bolt",
     "lld": "check-lld",
     "flang": "check-flang",
@@ -141,6 +144,23 @@
 
 RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"}
 
+# Meta projects are projects that need explicit handling but do not reside
+# in their own top level folder. To add a meta project, the start of the path
+# for the metaproject should be mapped to the name of the project below.
+# Multiple paths can map to the same metaproject.
+META_PROJECTS = {
+    ("clang", "lib", "CIR"): "CIR",
+    ("clang", "test", "CIR"): "CIR",
+    ("clang", "include", "clang", "CIR"): "CIR",
+    ("*", "docs"): "docs",
+    ("llvm", "utils", "gn"): "gn",
+    (".github", "workflows", "premerge.yaml"): ".ci",
+    ("third-party",): ".ci",
+}
+
+# Projects that should not run any tests. These need to be metaprojects.
+SKIP_PROJECTS = ["docs", "gn"]
+
 
 def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
     projects_with_dependents = set(projects)
@@ -233,21 +253,34 @@ def _compute_runtimes_to_build(
     return _exclude_projects(runtimes_to_build, platform)
 
 
+def _path_matches(matcher: tuple[str], file_path: tuple[str]) -> bool:
+    if len(file_path) < len(matcher):
+        return False
+    for match_part, file_part in zip(matcher, file_path):
+        if match_part == "*" or file_part == "*":
+            continue
+        if match_part != file_part:
+            return False
+    return True
+
+
+def _get_modified_projects_for_file(modified_file: str) -> Set[str]:
+    modified_projects = set()
+    path_parts = pathlib.Path(modified_file).parts
+    for meta_project_files in META_PROJECTS.keys():
+        if _path_matches(meta_project_files, path_parts):
+            meta_project = META_PROJECTS[meta_project_files]
+            if meta_project in SKIP_PROJECTS:
+                return set()
+            modified_projects.add(meta_project)
+    modified_projects.add(pathlib.Path(modified_file).parts[0])
+    return modified_projects
+
+
 def _get_modified_projects(modified_files: list[str]) -> Set[str]:
     modified_projects = set()
     for modified_file in modified_files:
-        path_parts = pathlib.Path(modified_file).parts
-        # Exclude files in the docs directory. They do not impact an test
-        # targets and there is a separate workflow used for ensuring the
-        # documentation builds.
-        if len(path_parts) > 2 and path_parts[1] == "docs":
-            continue
-        # Exclude files for the gn build. We do not test it within premerge
-        # and changes occur often enough that they otherwise take up
-        # capacity.
-        if len(path_parts) > 3 and path_parts[:3] == ("llvm", "utils", "gn"):
-            continue
-        modified_projects.add(pathlib.Path(modified_file).parts[0])
+        modified_projects.update(_get_modified_projects_for_file(modified_file))
     return modified_projects
 
 
@@ -267,6 +300,13 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
         runtimes_to_test_needs_reconfig
     )
+
+    # CIR is used as a pseudo-project in this script. It is built as part of the
+    # clang build, but it requires an explicit option to enable. We set that
+    # option here, and remove it from the projects_to_build list.
+    enable_cir = "ON" if "CIR" in projects_to_build else "OFF"
+    projects_to_build.discard("CIR")
+
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;). We use spaces to separate the check targets as they end up getting
@@ -279,6 +319,7 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "runtimes_check_targets_needs_reconfig": " ".join(
             sorted(runtimes_check_targets_needs_reconfig)
         ),
+        "enable_cir": enable_cir,
     }
 
 
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6299931e1ec3..11c4aea9b4e3 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -1,7 +1,7 @@
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-"""Does some stuff."""
+"""Tests for compute_projects.py"""
 
 import unittest
 
@@ -104,6 +104,10 @@ def test_clang(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(
+            env_variables["enable_cir"],
+            "OFF",
+        )
 
     def test_clang_windows(self):
         env_variables = compute_projects.get_env_variables(
@@ -126,6 +130,32 @@ def test_clang_windows(self):
             env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
+        self.assertEqual(env_variables["enable_cir"], "OFF")
+
+    def test_cir(self):
+        env_variables = compute_projects.get_env_variables(
+            ["clang/lib/CIR/CMakeLists.txt"], "Linux"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "clang;clang-tools-extra;lld;llvm;mlir",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-clang check-clang-cir check-clang-tools",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+        self.assertEqual(env_variables["enable_cir"], "ON")
 
     def test_bolt(self):
         env_variables = compute_projects.get_env_variables(
@@ -158,6 +188,7 @@ def test_mlir(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")
 
     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -168,10 +199,11 @@ def test_flang(self):
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+        self.assertEqual(env_variables["enable_cir"], "OFF")
 
     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
-            ["third-party/benchmark/CMakeLists.txt"], "Linux"
+            ["llvm-libgcc/CMakeLists.txt"], "Linux"
         )
         self.assertEqual(env_variables["projects_to_build"], "")
         self.assertEqual(env_variables["project_check_targets"], "")
@@ -237,7 +269,7 @@ def test_ci(self):
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-bolt check-clang check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
         )
         self.assertEqual(
             env_variables["runtimes_to_build"],
@@ -276,6 +308,66 @@ def test_clang_tools_extra(self):
         self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
         self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
+    def test_premerge_workflow(self):
+        env_variables = compute_projects.get_env_variables(
+            [".github/workflows/premerge.yaml"], "Linux"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"],
+            "compiler-rt;libc;libcxx;libcxxabi;libunwind",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt check-libc",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+
+    def test_other_github_workflow(self):
+        env_variables = compute_projects.get_env_variables(
+            [".github/workflows/docs.yml"], "Linux"
+        )
+        self.assertEqual(env_variables["projects_to_build"], "")
+        self.assertEqual(env_variables["project_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_to_build"], "")
+        self.assertEqual(env_variables["runtimes_check_targets"], "")
+        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+
+    def test_third_party_benchmark(self):
+        env_variables = compute_projects.get_env_variables(
+            ["third-party/benchmark/CMakeLists.txt"], "Linux"
+        )
+        self.assertEqual(
+            env_variables["projects_to_build"],
+            "bolt;clang;clang-tools-extra;flang;libclc;lld;lldb;llvm;mlir;polly",
+        )
+        self.assertEqual(
+            env_variables["project_check_targets"],
+            "check-bolt check-clang check-clang-cir check-clang-tools check-flang check-lld check-lldb check-llvm check-mlir check-polly",
+        )
+        self.assertEqual(
+            env_variables["runtimes_to_build"],
+            "compiler-rt;libc;libcxx;libcxxabi;libunwind",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets"],
+            "check-compiler-rt check-libc",
+        )
+        self.assertEqual(
+            env_variables["runtimes_check_targets_needs_reconfig"],
+            "check-cxx check-cxxabi check-unwind",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 143e6ab4cf46..26fdeef1913a 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,3 +1,13 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Collects Github metrics and uploads them to Grafana.
+
+This script contains machinery that will pull metrics periodically from Github
+about workflow runs. It will upload the collected metrics to the specified
+Grafana instance.
+"""
+
 import collections
 import datetime
 import github
diff --git a/.ci/metrics/metrics_test.py b/.ci/metrics/metrics_test.py
new file mode 100644
index 000000000000..259e55f81793
--- /dev/null
+++ b/.ci/metrics/metrics_test.py
@@ -0,0 +1,75 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Tests for metrics.py"""
+
+from dataclasses import dataclass
+import requests
+import unittest
+import unittest.mock
+
+import metrics
+
+
+class TestMetrics(unittest.TestCase):
+    def test_upload_gauge_metric(self):
+        """Test that we can upload a gauge metric correctly.
+
+        Also verify that we pass around parameters like API keys and user IDs
+        correctly to the HTTP POST request.
+        """
+        test_metrics = [metrics.GaugeMetric("gauge_test", 5, 1000)]
+        return_value = requests.Response()
+        return_value.status_code = 204
+        with unittest.mock.patch(
+            "requests.post", return_value=return_value
+        ) as post_mock:
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+            self.assertSequenceEqual(post_mock.call_args.args, [metrics.GRAFANA_URL])
+            self.assertEqual(
+                post_mock.call_args.kwargs["data"], "gauge_test value=5 1000"
+            )
+            self.assertEqual(
+                post_mock.call_args.kwargs["auth"], ("test_userid", "test_api_key")
+            )
+
+    def test_upload_job_metric(self):
+        """Test that we can upload a job metric correctly."""
+        test_metrics = [
+            metrics.JobMetrics("test_job", 5, 10, 1, 1000, 7, "test_workflow")
+        ]
+        return_value = requests.Response()
+        return_value.status_code = 204
+        with unittest.mock.patch(
+            "requests.post", return_value=return_value
+        ) as post_mock:
+            metrics.upload_metrics(test_metrics, "test_userid", "test_aoi_key")
+            self.assertEqual(
+                post_mock.call_args.kwargs["data"],
+                "test_job queue_time=5,run_time=10,status=1 1000",
+            )
+
+    def test_upload_unknown_metric(self):
+        """Test we report an error if we encounter an unknown metric type."""
+
+        @dataclass
+        class FakeMetric:
+            fake_data: str
+
+        test_metrics = [FakeMetric("test")]
+
+        with self.assertRaises(ValueError):
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+
+    def test_bad_response_code(self):
+        """Test that we gracefully handle HTTP response errors."""
+        test_metrics = [metrics.GaugeMetric("gauge_test", 5, 1000)]
+        return_value = requests.Response()
+        return_value.status_code = 403
+        # Just assert that we continue running here and do not raise anything.
+        with unittest.mock.patch("requests.post", return_value=return_value) as _:
+            metrics.upload_metrics(test_metrics, "test_userid", "test_api_key")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 8d1faab13986..6db24d894eb7 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -21,12 +21,7 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"
 INSTALL_DIR="${BUILD_DIR}/install"
 rm -rf "${BUILD_DIR}"
 
-ccache --zero-stats
-
-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing cache"
-  ccache --clear
-fi
+sccache --zero-stats
 
 mkdir -p artifacts/reproducers
 
@@ -36,7 +31,7 @@ export CLANG_CRASH_DIAGNOSTICS_DIR=`realpath artifacts/reproducers`
 function at-exit {
   retcode=$?
 
-  ccache --print-stats > artifacts/ccache_stats.txt
+  sccache --show-stats > artifacts/sccache_stats.txt
   cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log
   cp "${BUILD_DIR}"/test-results.*.xml artifacts/ || :
 
@@ -53,6 +48,7 @@ targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
 runtime_targets_needs_reconfig="${5}"
+enable_cir="${6}"
 
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"
 
@@ -72,13 +68,15 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
       -G Ninja \
       -D CMAKE_PREFIX_PATH="${HOME}/.local" \
       -D CMAKE_BUILD_TYPE=Release \
+      -D CLANG_ENABLE_CIR=${enable_cir} \
       -D LLVM_ENABLE_ASSERTIONS=ON \
       -D LLVM_BUILD_EXAMPLES=ON \
       -D COMPILER_RT_BUILD_LIBFUZZER=OFF \
       -D LLVM_LIT_ARGS="${lit_args}" \
       -D LLVM_ENABLE_LLD=ON \
       -D CMAKE_CXX_FLAGS=-gmlt \
-      -D LLVM_CCACHE_BUILD=ON \
+      -D CMAKE_C_COMPILER_LAUNCHER=sccache \
+      -D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
       -D LIBCXX_CXX_ABI=libcxxabi \
       -D MLIR_ENABLE_BINDINGS_PYTHON=ON \
       -D LLDB_ENABLE_PYTHON=ON \
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index 176350fac604..50a741677d73 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -21,11 +21,6 @@ BUILD_DIR="${BUILD_DIR:=${MONOREPO_ROOT}/build}"
 
 rm -rf "${BUILD_DIR}"
 
-if [[ -n "${CLEAR_CACHE:-}" ]]; then
-  echo "clearing sccache"
-  rm -rf "$SCCACHE_DIR"
-fi
-
 sccache --zero-stats
 function at-exit {
   retcode=$?
diff --git a/.github/workflows/atfe_post_merge.yml b/.github/workflows/atfe_post_merge.yml
index 2a6dac25e958..fb34460f918e 100644
--- a/.github/workflows/atfe_post_merge.yml
+++ b/.github/workflows/atfe_post_merge.yml
@@ -19,8 +19,28 @@ on:
   pull_request:
     types: [closed]
     branches:
-      - arm-toolchain
-      - release/arm-software/**
+      - 'arm-toolchain'
+      - 'release/arm-software/**'
+    paths:
+      # Run workflow if the PR modifies any of the following directories
+      - 'arm-software/embedded/**'
+      - 'clang/**'
+      - 'cmake/**'
+      - 'compiler-rt/**'
+      - 'libc/**'
+      - 'libcxx/**'
+      - 'libcxxabi/**'
+      - 'libunwind/**'
+      - 'lld/**'
+      - 'llvm/**'
+      - 'runtimes/**'
+      - 'utils/**'
+      # Exclude directories that are not relevant for ATfE
+      - '!arm-software/linux/**'
+      # Exclude changes in files with these extensions
+      - '!**/*.cfg'
+      - '!**/*.md'
+      - '!**/*.rst'
 
   # Trigger after the Automerge workflow completes
   workflow_run:
diff --git a/.github/workflows/atfe_pre_merge.yml b/.github/workflows/atfe_pre_merge.yml
index a1f0b90b536b..a137f7621237 100644
--- a/.github/workflows/atfe_pre_merge.yml
+++ b/.github/workflows/atfe_pre_merge.yml
@@ -18,8 +18,28 @@ on:
   pull_request:
     types: [opened, synchronize, reopened]
     branches:
-      - arm-software
-      - release/arm-software/**
+      - 'arm-software'
+      - 'release/arm-software/**'
+    paths:
+      # Run workflow if the PR modifies any of the following directories
+      - 'arm-software/embedded/**'
+      - 'clang/**'
+      - 'cmake/**'
+      - 'compiler-rt/**'
+      - 'libc/**'
+      - 'libcxx/**'
+      - 'libcxxabi/**'
+      - 'libunwind/**'
+      - 'lld/**'
+      - 'llvm/**'
+      - 'runtimes/**'
+      - 'utils/**'
+      # Exclude directories that are not relevant for ATfE
+      - '!arm-software/linux/**'
+      # Exclude changes in files with these extensions
+      - '!**/*.cfg'
+      - '!**/*.md'
+      - '!**/*.rst'
 
 jobs:
   build-and-test:
diff --git a/.github/workflows/automerge.yml b/.github/workflows/automerge.yml
index be0265efaf04..08ad5bd7a9bf 100644
--- a/.github/workflows/automerge.yml
+++ b/.github/workflows/automerge.yml
@@ -19,8 +19,6 @@ jobs:
         branches:
           - from_branch: main
             to_branch: arm-software
-          - from_branch: release/20.x
-            to_branch: release/arm-software/20.x
     steps:
       - name: Configure Access Token
         uses: actions/create-github-app-token@v1
diff --git a/.github/workflows/check_downstream_changes.yml b/.github/workflows/check_downstream_changes.yml
index 3556dfbdc390..8820a77a0b99 100644
--- a/.github/workflows/check_downstream_changes.yml
+++ b/.github/workflows/check_downstream_changes.yml
@@ -34,18 +34,10 @@ jobs:
     if: github.repository == 'arm/arm-toolchain'
 
     steps:
-      # Generate a token for gh tool
-      - name: Configure Access Token
-        uses: actions/create-github-app-token@v1
-        id: generate-token
-        with:
-          app-id: ${{ secrets.SYNC_APP_ID }}
-          private-key: ${{ secrets.SYNC_APP_PRIVATE_KEY }}
-
       - name: Checkout
         uses: actions/checkout@v4
 
       - name: Run Check Script
         run: python3 arm-software/ci/check_downstream_changes.py --repo ${{ github.repository }} --pr ${{ github.event.pull_request.number }}
         env:
-          GH_TOKEN: ${{ steps.generate-token.outputs.token }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/sync_from_upstream.yml b/.github/workflows/sync_from_upstream.yml
index 81a31bc545bb..864a40893685 100644
--- a/.github/workflows/sync_from_upstream.yml
+++ b/.github/workflows/sync_from_upstream.yml
@@ -12,7 +12,7 @@ jobs:
     if: github.repository == 'arm/arm-toolchain'
     strategy:
       matrix:
-        branch: [main, release/20.x]
+        branch: [main, release/21.x]
     env:
       UPSTREAM_REMOTE: https://github.com/llvm/llvm-project.git
     steps:
diff --git a/arm-software/ci/check_downstream_changes.py b/arm-software/ci/check_downstream_changes.py
index d597053e47fb..13b7cb1ff9b9 100755
--- a/arm-software/ci/check_downstream_changes.py
+++ b/arm-software/ci/check_downstream_changes.py
@@ -29,6 +29,9 @@
 logger = logging.getLogger(__name__)
 
 MERGE_IGNORE_PATHSPEC_FILE = Path(__file__).parent / ".automerge_ignore"
+HELP_COMMENT_TEXT = """This pull review modifies files outside of the `arm-software` directory, so please ensure it follows the [Downstream Patch Policy](https://github.com/arm/arm-toolchain/blob/arm-software/CONTRIBUTING.md#downstream-patch-policy).
+An automated check will test if the tagging requirements have been met. Please wait for approving reviews from both Arm Toolchain for Embedded and Arm Toolchain for Linux teams before merging."""
+DOWNSTREAM_CHANGE_LABEL = "downstream-change"
 
 
 # Check gh is working before using it
@@ -55,7 +58,16 @@ def check_gh_status() -> None:
 
 # Use gh to get information about the pull request.
 def get_pr_json(pr_num: str, repo: str) -> dict:
-    args = ["gh", "pr", "view", pr_num, "--repo", repo, "--json", "body,files,title"]
+    args = [
+        "gh",
+        "pr",
+        "view",
+        pr_num,
+        "--repo",
+        repo,
+        "--json",
+        "body,comments,files,labels,title",
+    ]
     logger.debug(f"Running `{shlex.join(args)}`")
     try:
         p = subprocess.run(
@@ -159,7 +171,7 @@ def has_downstream_changes(input_json: dict) -> bool:
 def find_pr_issue(input_json: dict) -> str:
     logger.debug("body text: %s", input_json["body"])
     matches = re.findall(
-        "^((?:removes )?downstream issue: *#([0-9]+))",
+        r"^\s*((?:removes )?downstream issue: *#([0-9]+))",
         input_json["body"],
         flags=re.I | re.M,
     )
@@ -180,6 +192,91 @@ def find_pr_issue(input_json: dict) -> str:
     return issue_num
 
 
+# Add a label to a pull request.
+def add_pr_label(pr_num: str, repo: str, input_json: dict) -> None:
+    # Check if the issue is already labelled.
+    for label_json in input_json["labels"]:
+        if label_json["name"] == DOWNSTREAM_CHANGE_LABEL:
+            return
+    args = [
+        "gh",
+        "pr",
+        "edit",
+        pr_num,
+        "--repo",
+        repo,
+        "--add-label",
+        DOWNSTREAM_CHANGE_LABEL,
+    ]
+    logger.debug(f"Running `{shlex.join(args)}`")
+    try:
+        p = subprocess.run(
+            args,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as error:
+        logger.error(
+            f"Check error. Failure adding pr label\ncmd:{shlex.join(error.cmd)}\ncode:{error.returncode}\nstdout:{error.stdout}\nstderr:{error.stderr}"
+        )
+        sys.exit(1)
+
+
+# Add a label to an issue.
+def add_issue_label(issue_num: str, repo: str, input_json: dict) -> None:
+    # Check if the issue is already labelled.
+    for label_json in input_json["labels"]:
+        if label_json["name"] == DOWNSTREAM_CHANGE_LABEL:
+            return
+    args = [
+        "gh",
+        "issue",
+        "edit",
+        issue_num,
+        "--repo",
+        repo,
+        "--add-label",
+        DOWNSTREAM_CHANGE_LABEL,
+    ]
+    logger.debug(f"Running `{shlex.join(args)}`")
+    try:
+        p = subprocess.run(
+            args,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as error:
+        logger.error(
+            f"Check error. Failure adding issue label\ncmd:{shlex.join(error.cmd)}\ncode:{error.returncode}\nstdout:{error.stdout}\nstderr:{error.stderr}"
+        )
+        sys.exit(1)
+
+
+# Add a comment explaining the additional requirements for downstream changes
+def add_help_comment(pr_num: str, repo: str, input_json: dict) -> None:
+    # Check if the comment has already been made first, so as not to repost
+    # every time the script runs.
+    for comment_json in input_json["comments"]:
+        if comment_json["body"] == HELP_COMMENT_TEXT:
+            return
+    args = ["gh", "pr", "comment", pr_num, "--repo", repo, "--body", HELP_COMMENT_TEXT]
+    logger.debug(f"Running `{shlex.join(args)}`")
+    try:
+        p = subprocess.run(
+            args,
+            check=True,
+            capture_output=True,
+            text=True,
+        )
+    except subprocess.CalledProcessError as error:
+        logger.error(
+            f"Check error. Failure adding comment\ncmd:{shlex.join(error.cmd)}\ncode:{error.returncode}\nstdout:{error.stdout}\nstderr:{error.stderr}"
+        )
+        sys.exit(1)
+
+
 def main():
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument(
@@ -197,6 +294,11 @@ def main():
         action="store_true",
         help="Print verbose log messages",
     )
+    parser.add_argument(
+        "--dryrun",
+        action="store_true",
+        help="Don't make modifications such as comments or labels",
+    )
 
     args = parser.parse_args()
 
@@ -213,6 +315,9 @@ def main():
 
     link_text = "Please check https://github.com/arm/arm-toolchain/blob/arm-software/CONTRIBUTING.md#downstream-patch-policy for information on the downstream patch policy and how changes need to be tracked."
     if needs_tagging:
+        if not args.dryrun:
+            add_pr_label(args.pr, args.repo, pr_json)
+            add_help_comment(args.pr, args.repo, pr_json)
         if issue_num is None:
             logger.info(
                 f"Check failed. Pull request #{args.pr} contains downstream changes, but does not have a correctly formatted link to a downstream tracking issue. {link_text}"
@@ -225,6 +330,8 @@ def main():
                 )
                 sys.exit(1)
             else:
+                if not args.dryrun:
+                    add_issue_label(args.pr, args.repo, pr_json)
                 logger.info(
                     f"Check passed. Pull request #{args.pr} contains downstream changes, and a correctly formatted link to a downstream tracking issue."
                 )
diff --git a/arm-software/docs/arm-toolchain-features.md b/arm-software/docs/arm-toolchain-features.md
index f4a59310eec0..a55540d2facb 100644
--- a/arm-software/docs/arm-toolchain-features.md
+++ b/arm-software/docs/arm-toolchain-features.md
@@ -20,7 +20,9 @@ There are no experimental features implemented.
 # Features
 
 ## Additional loop unroll in the LTO pipeline
-In some cases it is benefitial to perform an additional loop unroll pass so that extra information becomes available to later passes, e.g. SROA.
+In some cases it is beneficial to perform an additional loop unroll pass
+during Link Time Optimization (LTO) so that extra information becomes available
+to later passes, e.g. Scalar Replacement of Aggregates (SROA) pass.
 Use cases where this could be beneficial - multiple (N>=4) nested loops.
 
 Usage:
diff --git a/arm-software/embedded/CHANGELOG.md b/arm-software/embedded/CHANGELOG.md
index 0900e1e60cab..2648986f7753 100644
--- a/arm-software/embedded/CHANGELOG.md
+++ b/arm-software/embedded/CHANGELOG.md
@@ -1,15 +1,56 @@
 # Changelog
 
 All notable changes to this project will be documented in this file.
-This release is migrated from [LLVM-ET](https://github.com/ARM-software/LLVM-embedded-toolchain-for-Arm).
-The package structure remains the same, making this a direct successor release.
 
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
+### Added
+### Changed
+### Deprecated
+### Removed
+### Fixed
+### Security
+
+## [21.1.0]
+
+### Added
+
+- [`elf2bin`](../docs/elf2bin.md) utility added.
+- Use of zlib on Linux and macOS to allow compression of debug data with `-gz=zlib`.
+- C++ standard library `<fstream>` header support with `picolibc` and `newlib`.
+- C++ standard library support with LLVM libc overlay package.
+- Startup code, C++ sample for LLVM libc overlay package.
+
+### Changed
+
+- Multiple changes in added and updated library variants to improve coverage
+of supported targets.
+- Common header files for C and C++ libraries variants are now centralized under
+each target triple to avoid duplication and significantly reduce package size.
+- Arm AEABI memory builtins that are provided by the C library were excluded from the `compiler-rt` build, `-meabi gnu` should be used with `-nostdlib` to prevent the compiler from using the missing builtins.
+- `picolibc` now uses `__bothinit_array_start` and `__bothinit_array_end`
+linker defined symbols to run the static constructors instead of the traditional
+`__init_array_start` and `__init_array_end`. Linker scripts will need to be
+updated to define the `__bothinit_array_start` and `__bothinit_array_end` symbols.
+See the `picolibc.ld` linker script for an example.
+
+### Removed
+
+- Support for using `--sysroot` to point to a specific library variant directly,
+multilib driver selection logic should be used instead.
+
+### Fixed
+
+- Using samples on Windows.
+- Producing 64-bit compiler and tools binaries on Windows.
+
 ## [20.1.0]
 
+This release is migrated from [LLVM-ET](https://github.com/ARM-software/LLVM-embedded-toolchain-for-Arm).
+The package structure remains the same, making this a direct successor release.
+
 ### Added
 - Support for targeting AArch64 v8-R in both big-endian and little-endian modes (#64) (#68) (#102).
 - Additional library variants for AArch32 with strict alignment (LLVM-ET #605).
diff --git a/arm-software/embedded/CMakeLists.txt b/arm-software/embedded/CMakeLists.txt
index 95e74fdb695c..1df9116ed757 100644
--- a/arm-software/embedded/CMakeLists.txt
+++ b/arm-software/embedded/CMakeLists.txt
@@ -180,6 +180,11 @@ option(
     OFF
 )
 
+option(
+    ENABLE_MULTILIB_HEADER_OPTIMISATION "Enable multilib header optimisation phase"
+    ON
+)
+
 # Previously, the LLVM_TOOLCHAIN_LIBRARY_OVERLAY_INSTALL option was
 # called LLVM_TOOLCHAIN_NEWLIB_OVERLAY_INSTALL. Detect a setting of
 # that name, in case it's in an existing CMakeCache.txt or command
@@ -348,7 +353,6 @@ install(
 # Generate a toolchain ID.
 # Product code 'E' for Embedded.
 set(LLVM_TOOLCHAIN_PROJECT_CODE "E")
-set(LLVM_TOOLCHAIN_VERSION_SUFFIX "pre")
 include(${CMAKE_CURRENT_SOURCE_DIR}/../shared/cmake/generate_toolchain_id.cmake)
 
 # Include elf2bin
@@ -646,6 +650,7 @@ if(NOT PREBUILT_TARGET_LIBRARIES)
         -DLLVM_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}/llvm
         -DMULTILIB_JSON=${LLVM_TOOLCHAIN_MULTILIB_JSON}
         -DENABLE_VARIANTS=${ENABLE_VARIANTS_PASSTHROUGH}
+	-DENABLE_MULTILIB_HEADER_OPTIMISATION=${ENABLE_MULTILIB_HEADER_OPTIMISATION}
         -DENABLE_PARALLEL_LIB_CONFIG=${ENABLE_PARALLEL_LIB_CONFIG}
         -DENABLE_PARALLEL_LIB_BUILD=${ENABLE_PARALLEL_LIB_BUILD}
         -DPARALLEL_LIB_BUILD_LEVELS=${PARALLEL_LIB_BUILD_LEVELS}
diff --git a/arm-software/embedded/README.md b/arm-software/embedded/README.md
index 0ed2efd6ce47..3bb2dd71155e 100644
--- a/arm-software/embedded/README.md
+++ b/arm-software/embedded/README.md
@@ -142,24 +142,10 @@ To display the directory selected by the multilib system, add the flag
 To display all available multilibs run `clang` with the flag `-print-multi-lib`
 and a target triple like `--target=aarch64-none-elf` or `--target=arm-none-eabi`.
 
-It's possible that `clang` will choose a set of libraries that are not the ones
-you want to use. In this case you can bypass the multilib system by providing a
-`--sysroot` option specifying the directory containing the `include` and `lib`
-directories of the libraries you want to use. For example:
-
-```
-$ clang \
---sysroot=<install-dir>/ATfE-<revision>/lib/clang-runtimes/arm-none-eabi/armv6m_soft_nofp \
---target=armv6m-none-eabi \
--mfpu=none \
--fno-exceptions \
--fno-rtti \
--nostartfiles \
--lcrt0-semihost \
--lsemihost \
--T picolibc.ld \
--o example example.c
-```
+> [!WARNING]
+> `--sysroot` is not a substitute for multilib selection.
+> The `--sysroot` option cannot be used to override multilib logic
+> or manually select an arbitrary library variant.
 
 The FPU selection can be skipped, but it is not recommended to as the defaults
 are different to GCC ones.
diff --git a/arm-software/embedded/arm-multilib/CMakeLists.txt b/arm-software/embedded/arm-multilib/CMakeLists.txt
index 041ddc30d802..ec7fc46bb965 100644
--- a/arm-software/embedded/arm-multilib/CMakeLists.txt
+++ b/arm-software/embedded/arm-multilib/CMakeLists.txt
@@ -48,6 +48,7 @@ set(
     fvp/get_fvps.sh"
 )
 set(FVP_CONFIG_DIR "${TOOLCHAIN_SOURCE_DIR}/fvp/config" CACHE STRING "The directory in which the FVP models are installed.")
+option(ENABLE_MULTILIB_HEADER_OPTIMISATION "Enable multilib header optimisation phase" ON)
 option(
     ENABLE_PARALLEL_LIB_CONFIG
     "Run the library variant configuration steps in parallel."
@@ -172,6 +173,7 @@ foreach(lib_idx RANGE ${lib_count_dec})
     endif()
 
     if(variant IN_LIST ENABLE_VARIANTS OR ENABLE_VARIANTS STREQUAL "all")
+        list(APPEND enabled_variants ${variant})
         string(JSON variant_multilib_flags GET ${lib_def} "flags")
         # Placeholder libraries won't have a json, so store the error in
         # a variable so a fatal error isn't generated.
@@ -185,10 +187,12 @@ foreach(lib_idx RANGE ${lib_count_dec})
                 set(parent_dir_name arm-none-eabi)
             endif()
             set(destination_directory "${CMAKE_CURRENT_BINARY_DIR}/multilib/${parent_dir_name}/${variant}")
-            install(
-                DIRECTORY ${destination_directory}
-                DESTINATION ${parent_dir_name}
-            )
+            if(NOT ENABLE_MULTILIB_HEADER_OPTIMISATION)
+                install(
+                    DIRECTORY ${destination_directory}
+                    DESTINATION ${parent_dir_name}
+                )
+            endif()
             set(variant_json_file ${CMAKE_CURRENT_SOURCE_DIR}/json/variants/${variant_json})
 
             # Read info from the variant specific json.
@@ -251,6 +255,8 @@ foreach(lib_idx RANGE ${lib_count_dec})
                 TEST_EXCLUDE_FROM_MAIN TRUE
             )
 
+            list(APPEND all_runtime_targets runtimes-${variant})
+
             if(ENABLE_PARALLEL_LIB_CONFIG OR ENABLE_PARALLEL_LIB_BUILD)
                 # Create additional steps to configure/build the subprojects.
                 # These are collected to be run together, so that all the
@@ -337,6 +343,11 @@ foreach(lib_idx RANGE ${lib_count_dec})
             foreach(flag ${multilib_flags_list})
                 string(APPEND multilib_yaml_content "  - ${flag}\n")
             endforeach()
+            if(ENABLE_MULTILIB_HEADER_OPTIMISATION)
+                string(APPEND multilib_yaml_content "  IncludeDirs:\n")
+                string(APPEND multilib_yaml_content "  - ${parent_dir_name}/${variant}/include\n")
+                string(APPEND multilib_yaml_content "  - ${parent_dir_name}/include\n")
+            endif()
             string(APPEND multilib_yaml_content "  Group: stdlibs\n")
         else()
             # In place of a json, an error message is expected.
@@ -354,6 +365,16 @@ foreach(lib_idx RANGE ${lib_count_dec})
 
 endforeach()
 
+# Check that all variants that were configured to be enabled
+# were actually enabled.
+if(NOT ENABLE_VARIANTS STREQUAL "all")
+    foreach(expected_variant ${ENABLE_VARIANTS})
+        if(NOT expected_variant IN_LIST enabled_variants)
+            message(FATAL_ERROR "Variant name ${expected_variant} not found in ${MULTILIB_JSON}")
+        endif()
+    endforeach()
+endif()
+
 # Multilib file is generated in two parts.
 # 1. Template is filled with multilib flags from json
 configure_file(
@@ -386,7 +407,25 @@ add_custom_command(
 )
 
 add_custom_target(multilib-yaml ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/multilib/multilib.yaml)
-install(
-    FILES ${CMAKE_CURRENT_BINARY_DIR}/multilib/multilib.yaml
-    DESTINATION .
-)
+if(NOT ENABLE_MULTILIB_HEADER_OPTIMISATION)
+    install(
+        FILES ${CMAKE_CURRENT_BINARY_DIR}/multilib/multilib.yaml
+        DESTINATION .
+    )
+endif()
+
+# Extract common header files which spans across multilib variant directories.
+if(ENABLE_MULTILIB_HEADER_OPTIMISATION)
+    add_custom_target(generate-multilib-common-headers ALL
+        COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_SOURCE_DIR}/common-headers-generate.py"
+                "${CMAKE_CURRENT_BINARY_DIR}/multilib"
+                "${CMAKE_CURRENT_BINARY_DIR}/multilib-optimised"
+        COMMENT "Generating common headers"
+        DEPENDS ${all_runtime_targets} multilib-yaml
+    )
+
+    install(
+        DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/multilib-optimised/"
+        DESTINATION .
+    )
+endif()
diff --git a/arm-software/embedded/arm-multilib/common-headers-generate.py b/arm-software/embedded/arm-multilib/common-headers-generate.py
new file mode 100755
index 000000000000..55afe392edcf
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/common-headers-generate.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python3
+
+"""
+Identifies and extracts header files that are common across multiple multilib variant directories.
+
+This script scans all variant folders within a multilib target directory
+(e.g., `arm-none-eabi/variant1/include`, `arm-none-eabi/variant2/include`, etc.) and compares all
+`.h` files. If the same file (by name and content) appears in multiple variants, it will be moved
+to a shared include directory at:
+
+    <CMAKE_BINARY_DIR>/multilib-optimised/<target>/include/
+
+for the following multilib targets:
+- arm-none-eabi
+- aarch64-none-elf
+
+Arguments:
+    <CMAKE_BINARY_DIR>/multilib Path to the CMake build directory containing non optmised multilib.
+    eg: build/multilib-builds/multilib/picolibc-build/multilib
+    <CMAKE_BINARY_DIR>/multilib-optimised  Path to the CMake build directory where optimised multilib should be generated..
+    eg: build/multilib-builds/multilib/picolibc-build/multilib-optimised
+
+This is useful to reduce duplication in the toolchain by centralising common headers
+that are shared across architecture variants.
+"""
+import argparse
+import os
+import filecmp
+import shutil
+
+# Define the multilib target dirs which want to process
+MULTILIB_TARGET_DIRS = ["arm-none-eabi", "aarch64-none-elf"]
+
+
+def files_are_identical(f1, f2):
+    return filecmp.cmp(f1, f2, shallow=False)
+
+
+def collect_variant_include_paths(input_target_dir):
+    """
+    Extracts each multilib variant and its corresponding include path from the non-optimised multilib directory.
+    Stores the results to enable later comparison of header contents across different non-optimised multilib variant
+    include paths.
+
+    """
+    variant_include_path = {}
+    for variant in os.listdir(input_target_dir):
+        include_path = os.path.join(input_target_dir, variant, "include")
+        if os.path.isdir(include_path):
+            variant_include_path[variant] = include_path
+    return variant_include_path
+
+
+def extract_common_headers_for_targets(args):
+    if os.path.exists(args.multilib_optimised_dir):
+        shutil.rmtree(args.multilib_optimised_dir)
+
+    if os.path.isdir(args.multilib_non_optimised_dir):
+        existing_target_dirs = [
+            dir_name
+            for dir_name in MULTILIB_TARGET_DIRS
+            if os.path.isdir(os.path.join(args.multilib_non_optimised_dir, dir_name))
+        ]
+        if not existing_target_dirs:
+            raise Exception(
+                f"Error: Expected to find either arm-none-eabi or aarch64-none-elf in '{args.multilib_non_optimised_dir}', but folder is empty."
+            )
+        src_yaml = os.path.join(args.multilib_non_optimised_dir, "multilib.yaml")
+        if not os.path.exists(src_yaml):
+            raise FileNotFoundError(f"Source yaml '{src_yaml}' does not exist.")
+    else:
+        raise FileNotFoundError(
+            f"Error: Expected folder '{args.multilib_non_optimised_dir}' does not exist"
+        )
+
+    for target in MULTILIB_TARGET_DIRS:
+        input_target_dir = os.path.join(
+            os.path.abspath(args.multilib_non_optimised_dir), target
+        )
+        output_target_dir = os.path.join(
+            os.path.abspath(args.multilib_optimised_dir), target
+        )
+        output_include_dir = os.path.join(output_target_dir, "include")
+
+        if not os.path.isdir(input_target_dir):
+            print(
+                f"Skipping extracting the common headers for {target}: input path {input_target_dir} not found"
+            )
+            continue
+
+        variant_includes = collect_variant_include_paths(input_target_dir)
+        if len(variant_includes) < 2:
+            print(
+                f"Skipping extracting the common headers for {target}: not enough variants to compare. "
+                "At least two variants must be enabled for the multilib header optimisation phase to proceed."
+            )
+            # The script always creates the multilib-optimised folder, even when there's only one variant and no
+            # optimization is applied. In that case, multilib-optimised will just contain a copy of the
+            # single variant from the non-optimised multilib directory.
+            if os.path.exists(input_target_dir):
+                shutil.copytree(input_target_dir, output_target_dir, dirs_exist_ok=False)
+            continue
+
+        # Creating the common include headers for each target
+        os.makedirs(output_include_dir, exist_ok=False)
+
+        # Step 1: compare first two variants and extract the common headers into the targets common include directory
+        base_dir = list(variant_includes.values())[0]
+        compare_dir = list(variant_includes.values())[1]
+        for root, sub_dirs, header_files in os.walk(base_dir):
+            sub_dir_root = os.path.relpath(root, base_dir)
+            for header in header_files:
+                h1 = os.path.join(base_dir, sub_dir_root, header)
+                h2 = os.path.join(compare_dir, sub_dir_root, header)
+                if os.path.exists(h2) and files_are_identical(h1, h2):
+                    out_dir = os.path.join(output_include_dir, sub_dir_root)
+                    os.makedirs(out_dir, exist_ok=True)
+                    shutil.copy2(h1, os.path.join(out_dir, header))
+
+        # Step 2: Compare all the variants with the new common include. Any headers that do not match
+        # and do not exit in common include should retain in their respective variant specific directories.
+        for variant, include_path in variant_includes.items():
+            for root, sub_dirs, header_files in os.walk(include_path):
+                sub_dir_root = os.path.relpath(root, include_path)
+                for header in header_files:
+                    variant_header = os.path.join(include_path, sub_dir_root, header)
+                    common_header = os.path.join(
+                        output_include_dir, sub_dir_root, header
+                    )
+                    if not os.path.exists(common_header) or not files_are_identical(
+                        variant_header, common_header
+                    ):
+                        out_dir = os.path.join(
+                            os.path.abspath(args.multilib_optimised_dir),
+                            target,
+                            variant,
+                            sub_dir_root,
+                            "include",
+                        )
+                        os.makedirs(out_dir, exist_ok=True)
+                        shutil.copy2(variant_header, os.path.join(out_dir, header))
+
+        # Step3: For each variant, the lib and share directories should be copied from the non-optimised multilib
+        # directory as it is.
+        for variant in variant_includes:
+            remaining_dirs = ["lib", "share"]
+            for folder in remaining_dirs:
+                src_dir = os.path.join(input_target_dir, variant, folder)
+                dst_dir = os.path.join(output_target_dir, variant, folder)
+                if os.path.exists(src_dir):
+                    # If destination exists, remove it first
+                    if os.path.exists(dst_dir):
+                        shutil.rmtree(dst_dir)
+                    os.makedirs(os.path.dirname(dst_dir), exist_ok=True)
+                    shutil.copytree(src_dir, dst_dir)
+                else:
+                    print(f"Warning: {src_dir} does not exist and will be skipped.")
+
+    # Step4: Copy multilib.yaml file as it is from the non-optimised multilib directoy.
+    src_yaml = os.path.join(args.multilib_non_optimised_dir, "multilib.yaml")
+    dst_yaml = os.path.join(args.multilib_optimised_dir, "multilib.yaml")
+    shutil.copy2(src_yaml, dst_yaml)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "multilib_non_optimised_dir",
+        help="CMake binary directory containing the non-optimised multilib headers",
+    )
+    parser.add_argument(
+        "multilib_optimised_dir",
+        help="CMake binary directory where the optimised multilib headers should be generated",
+    )
+    args = parser.parse_args()
+
+    extract_common_headers_for_targets(args)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/arm-software/embedded/arm-multilib/json/multilib.json b/arm-software/embedded/arm-multilib/json/multilib.json
index 066abe78cd22..934532fe9f08 100644
--- a/arm-software/embedded/arm-multilib/json/multilib.json
+++ b/arm-software/embedded/arm-multilib/json/multilib.json
@@ -121,43 +121,43 @@
             "libraries_supported": "picolibc,llvmlibc"
         },
         {
-            "variant": "armv4t_exn_rtti",
-            "json": "armv4t_exn_rtti.json",
+            "variant": "armv4t_exn_rtti_size",
+            "json": "armv4t_exn_rtti_size.json",
             "flags": "--target=armv4t-unknown-none-eabi -mfpu=none"
         },
         {
-            "variant": "armv4t",
-            "json": "armv4t.json",
+            "variant": "armv4t_size",
+            "json": "armv4t_size.json",
             "flags": "--target=armv4t-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armv5te_exn_rtti",
-            "json": "armv5te_exn_rtti.json",
+            "variant": "armv5te_exn_rtti_size",
+            "json": "armv5te_exn_rtti_size.json",
             "flags": "--target=armv5e-unknown-none-eabi -mfpu=none"
         },
         {
-            "variant": "armv5te",
-            "json": "armv5te.json",
+            "variant": "armv5te_size",
+            "json": "armv5te_size.json",
             "flags": "--target=armv5e-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armv6m_soft_nofp_exn_rtti",
-            "json": "armv6m_soft_nofp_exn_rtti.json",
+            "variant": "armv6m_soft_nofp_exn_rtti_size",
+            "json": "armv6m_soft_nofp_exn_rtti_size.json",
             "flags": "--target=thumbv6m-unknown-none-eabi -mfpu=none"
         },
         {
-            "variant": "armv6m_soft_nofp",
-            "json": "armv6m_soft_nofp.json",
+            "variant": "armv6m_soft_nofp_size",
+            "json": "armv6m_soft_nofp_size.json",
             "flags": "--target=thumbv6m-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armebv6m_soft_nofp_exn_rtti",
-            "json": "armebv6m_soft_nofp_exn_rtti.json",
+            "variant": "armebv6m_soft_nofp_exn_rtti_size",
+            "json": "armebv6m_soft_nofp_exn_rtti_size.json",
             "flags": "--target=thumbebv6m-unknown-none-eabi -mfpu=none"
         },
         {
-            "variant": "armebv6m_soft_nofp",
-            "json": "armebv6m_soft_nofp.json",
+            "variant": "armebv6m_soft_nofp_size",
+            "json": "armebv6m_soft_nofp_size.json",
             "flags": "--target=thumbebv6m-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti"
         },
         {
@@ -331,133 +331,133 @@
             "flags": "--target=armv7r-unknown-none-eabi -mfpu=vfpv3xd -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned",
-            "json": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned.json",
+            "variant": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size",
+            "json": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=fpv4-sp-d16 -munaligned-access"
         },
         {
-            "variant": "armv7m_soft_fpv4_sp_d16_exn_rtti",
-            "json": "armv7m_soft_fpv4_sp_d16_exn_rtti.json",
+            "variant": "armv7m_soft_fpv4_sp_d16_exn_rtti_size",
+            "json": "armv7m_soft_fpv4_sp_d16_exn_rtti_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=fpv4-sp-d16 -mno-unaligned-access"
         },
         {
-            "variant": "armv7m_soft_fpv4_sp_d16_unaligned",
-            "json": "armv7m_soft_fpv4_sp_d16_unaligned.json",
+            "variant": "armv7m_soft_fpv4_sp_d16_unaligned_size",
+            "json": "armv7m_soft_fpv4_sp_d16_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv7m_soft_fpv4_sp_d16",
-            "json": "armv7m_soft_fpv4_sp_d16.json",
+            "variant": "armv7m_soft_fpv4_sp_d16_size",
+            "json": "armv7m_soft_fpv4_sp_d16_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armebv7m_soft_fpv4_sp_d16_exn_rtti",
-            "json": "armebv7m_soft_fpv4_sp_d16_exn_rtti.json",
+            "variant": "armebv7m_soft_fpv4_sp_d16_exn_rtti_size",
+            "json": "armebv7m_soft_fpv4_sp_d16_exn_rtti_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabi -mfpu=fpv4-sp-d16"
         },
         {
-            "variant": "armebv7m_soft_fpv4_sp_d16",
-            "json": "armebv7m_soft_fpv4_sp_d16.json",
+            "variant": "armebv7m_soft_fpv4_sp_d16_size",
+            "json": "armebv7m_soft_fpv4_sp_d16_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabi -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned",
-            "json": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned.json",
+            "variant": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size",
+            "json": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16 -munaligned-access"
         },
         {
-            "variant": "armv7m_hard_fpv4_sp_d16_exn_rtti",
-            "json": "armv7m_hard_fpv4_sp_d16_exn_rtti.json",
+            "variant": "armv7m_hard_fpv4_sp_d16_exn_rtti_size",
+            "json": "armv7m_hard_fpv4_sp_d16_exn_rtti_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16 -mno-unaligned-access"
         },
         {
-            "variant": "armebv7m_hard_fpv4_sp_d16_exn_rtti",
-            "json": "armebv7m_hard_fpv4_sp_d16_exn_rtti.json",
+            "variant": "armebv7m_hard_fpv4_sp_d16_exn_rtti_size",
+            "json": "armebv7m_hard_fpv4_sp_d16_exn_rtti_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16"
         },
         {
-            "variant": "armv7m_hard_fpv4_sp_d16_unaligned",
-            "json": "armv7m_hard_fpv4_sp_d16_unaligned.json",
+            "variant": "armv7m_hard_fpv4_sp_d16_unaligned_size",
+            "json": "armv7m_hard_fpv4_sp_d16_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv7m_hard_fpv4_sp_d16",
-            "json": "armv7m_hard_fpv4_sp_d16.json",
+            "variant": "armv7m_hard_fpv4_sp_d16_size",
+            "json": "armv7m_hard_fpv4_sp_d16_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armebv7m_hard_fpv4_sp_d16",
-            "json": "armebv7m_hard_fpv4_sp_d16.json",
+            "variant": "armebv7m_hard_fpv4_sp_d16_size",
+            "json": "armebv7m_hard_fpv4_sp_d16_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armv7m_hard_fpv5_d16_exn_rtti_unaligned",
-            "json": "armv7m_hard_fpv5_d16_exn_rtti_unaligned.json",
+            "variant": "armv7m_hard_fpv5_d16_exn_rtti_unaligned_size",
+            "json": "armv7m_hard_fpv5_d16_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv5-d16 -munaligned-access"
         },
         {
-            "variant": "armv7m_hard_fpv5_d16_unaligned",
-            "json": "armv7m_hard_fpv5_d16_unaligned.json",
+            "variant": "armv7m_hard_fpv5_d16_unaligned_size",
+            "json": "armv7m_hard_fpv5_d16_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv7m_soft_nofp_exn_rtti_unaligned",
-            "json": "armv7m_soft_nofp_exn_rtti_unaligned.json",
+            "variant": "armv7m_soft_nofp_exn_rtti_unaligned_size",
+            "json": "armv7m_soft_nofp_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=none -munaligned-access"
         },
         {
-            "variant": "armv7m_soft_nofp_exn_rtti",
-            "json": "armv7m_soft_nofp_exn_rtti.json",
+            "variant": "armv7m_soft_nofp_exn_rtti_size",
+            "json": "armv7m_soft_nofp_exn_rtti_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=none -mno-unaligned-access"
         },
         {
-            "variant": "armv7m_soft_nofp_unaligned",
-            "json": "armv7m_soft_nofp_unaligned.json",
+            "variant": "armv7m_soft_nofp_unaligned_size",
+            "json": "armv7m_soft_nofp_unaligned_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv7m_soft_nofp",
-            "json": "armv7m_soft_nofp.json",
+            "variant": "armv7m_soft_nofp_size",
+            "json": "armv7m_soft_nofp_size.json",
             "flags": "--target=thumbv7m-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armebv7m_soft_nofp_exn_rtti",
-            "json": "armebv7m_soft_nofp_exn_rtti.json",
+            "variant": "armebv7m_soft_nofp_exn_rtti_size",
+            "json": "armebv7m_soft_nofp_exn_rtti_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabi -mfpu=none"
         },
         {
-            "variant": "armebv7m_soft_nofp",
-            "json": "armebv7m_soft_nofp.json",
+            "variant": "armebv7m_soft_nofp_size",
+            "json": "armebv7m_soft_nofp_size.json",
             "flags": "--target=thumbebv7m-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti"
         },
         {
-            "variant": "armv8m.main_soft_nofp_exn_rtti_unaligned",
-            "json": "armv8m.main_soft_nofp_exn_rtti_unaligned.json",
+            "variant": "armv8m.main_soft_nofp_exn_rtti_unaligned_size",
+            "json": "armv8m.main_soft_nofp_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8m.main-unknown-none-eabi -mfpu=none -munaligned-access"
         },
         {
-            "variant": "armv8m.main_soft_nofp_unaligned",
-            "json": "armv8m.main_soft_nofp_unaligned.json",
+            "variant": "armv8m.main_soft_nofp_unaligned_size",
+            "json": "armv8m.main_soft_nofp_unaligned_size.json",
             "flags": "--target=thumbv8m.main-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8m.main_hard_fp_exn_rtti_unaligned",
-            "json": "armv8m.main_hard_fp_exn_rtti_unaligned.json",
+            "variant": "armv8m.main_hard_fp_exn_rtti_unaligned_size",
+            "json": "armv8m.main_hard_fp_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8m.main-unknown-none-eabihf -mfpu=fpv5-sp-d16 -munaligned-access"
         },
         {
-            "variant": "armv8m.main_hard_fp_unaligned",
-            "json": "armv8m.main_hard_fp_unaligned.json",
+            "variant": "armv8m.main_hard_fp_unaligned_size",
+            "json": "armv8m.main_hard_fp_unaligned_size.json",
             "flags": "--target=thumbv8m.main-unknown-none-eabihf -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned",
-            "json": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_unaligned",
-            "json": "armv8.1m.main_soft_nofp_nomve_unaligned.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_unaligned_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
@@ -465,29 +465,54 @@
             "json": "armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -munaligned-access"
         },
+        {
+            "variant": "armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -munaligned-access -Os"
+        },
         {
             "variant": "armv8.1m.main_hard_fp_nomve_unaligned",
             "json": "armv8.1m.main_hard_fp_nomve_unaligned.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
+        {
+            "variant": "armv8.1m.main_hard_fp_nomve_unaligned_size",
+            "json": "armv8.1m.main_hard_fp_nomve_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -munaligned-access -Os"
+        },
         {
             "variant": "armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned",
             "json": "armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -munaligned-access"
         },
+        {
+            "variant": "armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -munaligned-access -Os"
+        },
         {
             "variant": "armv8.1m.main_hard_fpdp_nomve_unaligned",
             "json": "armv8.1m.main_hard_fpdp_nomve_unaligned.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -munaligned-access"
         },
+        {
+            "variant": "armv8.1m.main_hard_fpdp_nomve_unaligned_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -munaligned-access -Os"
+        },
         {
             "variant": "armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned",
             "json": "armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve_exn_rtti",
-            "json": "armv8.1m.main_hard_nofp_mve_exn_rtti.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -munaligned-access -Os"
+        },
+        {
+            "variant": "armv8.1m.main_hard_nofp_mve_exn_rtti_size",
+            "json": "armv8.1m.main_hard_nofp_mve_exn_rtti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -mno-unaligned-access"
         },
         {
@@ -496,8 +521,13 @@
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve",
-            "json": "armv8.1m.main_hard_nofp_mve.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_unaligned_size",
+            "json": "armv8.1m.main_hard_nofp_mve_unaligned_size.json",
+            "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -munaligned-access -Os"
+        },
+        {
+            "variant": "armv8.1m.main_hard_nofp_mve_size",
+            "json": "armv8.1m.main_hard_nofp_mve_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
@@ -506,83 +536,83 @@
             "error": "No library available for MVE with soft-float ABI. Try -mfloat-abi=hard."
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned",
-            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -mbranch-protection=pac-ret+bti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti",
-            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned",
-            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti",
-            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti.json",
+            "variant": "armv8.1m.main_soft_nofp_nomve_pacret_bti_size",
+            "json": "armv8.1m.main_soft_nofp_nomve_pacret_bti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabi -mfpu=none -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned",
-            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned.json",
+            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti",
-            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti.json",
+            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size",
+            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned",
-            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned.json",
+            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size",
+            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti",
-            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti.json",
+            "variant": "armv8.1m.main_hard_fp_nomve_pacret_bti_size",
+            "json": "armv8.1m.main_hard_fp_nomve_pacret_bti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned",
-            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned.json",
+            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti",
-            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti.json",
+            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned",
-            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned.json",
+            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti",
-            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti.json",
+            "variant": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_size",
+            "json": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+fp16 -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned",
-            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size",
+            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti",
-            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size",
+            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned",
-            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size",
+            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -munaligned-access"
         },
         {
-            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti",
-            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti.json",
+            "variant": "armv8.1m.main_hard_nofp_mve_pacret_bti_size",
+            "json": "armv8.1m.main_hard_nofp_mve_pacret_bti_size.json",
             "flags": "--target=thumbv8.1m.main-unknown-none-eabihf -march=thumbv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -fno-exceptions -fno-rtti -mno-unaligned-access"
         }
     ]
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti_size.json
index 46b9df4e1b2a..50a1555a3dab 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv6m",
-            "VARIANT": "armebv6m_soft_nofp_exn_rtti",
+            "VARIANT": "armebv6m_soft_nofp_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv6m -mbig-endian -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp.json b/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_size.json
index 437251c7866e..416cd907751c 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv6m_soft_nofp_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv6m",
-            "VARIANT": "armebv6m_soft_nofp",
+            "VARIANT": "armebv6m_soft_nofp_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv6m -mbig-endian -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti_size.json
index 601ff5978d53..a7aebb67506f 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_hard_fpv4_sp_d16_exn_rtti",
+            "VARIANT": "armebv7m_hard_fpv4_sp_d16_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_size.json
index f8cd0d479267..f20224110436 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_hard_fpv4_sp_d16_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_hard_fpv4_sp_d16",
+            "VARIANT": "armebv7m_hard_fpv4_sp_d16_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti_size.json
index cb0aa7413bc7..cb0df2aa5155 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_soft_fpv4_sp_d16_exn_rtti",
+            "VARIANT": "armebv7m_soft_fpv4_sp_d16_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_size.json
index 756a1aa9404a..1ee8e406d4af 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_fpv4_sp_d16_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_soft_fpv4_sp_d16",
+            "VARIANT": "armebv7m_soft_fpv4_sp_d16_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti_size.json
index f54c045e7bfe..bb6143d6ade9 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_soft_nofp_exn_rtti",
+            "VARIANT": "armebv7m_soft_nofp_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mbig-endian -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp.json b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp.json
rename to arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_size.json
index f624abf63a25..003465ac6351 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armebv7m_soft_nofp_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armebv7m_soft_nofp",
+            "VARIANT": "armebv7m_soft_nofp_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mbig-endian -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti_size.json
index 136de4715206..f7c6a799dc46 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv4t_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv4t",
-            "VARIANT": "armv4t_exn_rtti",
+            "VARIANT": "armv4t_exn_rtti_size",
             "COMPILE_FLAGS": "-march=armv4t -mfpu=none",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv4t.json b/arm-software/embedded/arm-multilib/json/variants/armv4t_size.json
similarity index 97%
rename from arm-software/embedded/arm-multilib/json/variants/armv4t.json
rename to arm-software/embedded/arm-multilib/json/variants/armv4t_size.json
index 40153497d1b9..4cfc39e23707 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv4t.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv4t_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv4t",
-            "VARIANT": "armv4t",
+            "VARIANT": "armv4t_size",
             "COMPILE_FLAGS": "-march=armv4t -mfpu=none",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti_size.json
index 23e9399c1b96..a84b9b3a5531 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv5te_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv5te",
-            "VARIANT": "armv5te_exn_rtti",
+            "VARIANT": "armv5te_exn_rtti_size",
             "COMPILE_FLAGS": "-march=armv5te -mfpu=none",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv5te.json b/arm-software/embedded/arm-multilib/json/variants/armv5te_size.json
similarity index 97%
rename from arm-software/embedded/arm-multilib/json/variants/armv5te.json
rename to arm-software/embedded/arm-multilib/json/variants/armv5te_size.json
index 7e374e5f8d58..ce98b44f1371 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv5te.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv5te_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv5te",
-            "VARIANT": "armv5te",
+            "VARIANT": "armv5te_size",
             "COMPILE_FLAGS": "-march=armv5te -mfpu=none",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti_size.json
index a6fe9bc4125a..c8ba5f9e3474 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv6m",
-            "VARIANT": "armv6m_soft_nofp_exn_rtti",
+            "VARIANT": "armv6m_soft_nofp_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv6m -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp.json b/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp.json
rename to arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_size.json
index 897810e28875..b7e8c09e488a 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv6m_soft_nofp_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv6m",
-            "VARIANT": "armv6m_soft_nofp",
+            "VARIANT": "armv6m_soft_nofp_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv6m -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_size.json
index 6bcec9b50468..0a62a4bb54e6 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv4_sp_d16_exn_rtti",
+            "VARIANT": "armv7m_hard_fpv4_sp_d16_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size.json
similarity index 99%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size.json
index 5d90626f3456..7e72409b9159 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned",
+            "VARIANT": "armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv4-sp-d16",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_size.json
index 793f603e5411..8154a83708a8 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv4_sp_d16",
+            "VARIANT": "armv7m_hard_fpv4_sp_d16_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned_size.json
index 250eec8ff1ab..0e8c2cdde88b 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv4_sp_d16_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv4_sp_d16_unaligned",
+            "VARIANT": "armv7m_hard_fpv4_sp_d16_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv4-sp-d16",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned_size.json
index ba4849d6e1d9..405d9997bfd3 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv5_d16_exn_rtti_unaligned",
+            "VARIANT": "armv7m_hard_fpv5_d16_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv5-d16",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned_size.json
index 789e20a558b6..bf5dd4d7b5f2 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_hard_fpv5_d16_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_hard_fpv5_d16_unaligned",
+            "VARIANT": "armv7m_hard_fpv5_d16_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv7m -mfpu=fpv5-d16",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_size.json
index 9f21953f0f1d..fec8b4ef724a 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_fpv4_sp_d16_exn_rtti",
+            "VARIANT": "armv7m_soft_fpv4_sp_d16_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size.json
similarity index 99%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size.json
index 7001cb71797a..9c4e23c13aa0 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned",
+            "VARIANT": "armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mfpu=fpv4-sp-d16",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_size.json
index 81f3a07f9b30..e893d1787e51 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_fpv4_sp_d16",
+            "VARIANT": "armv7m_soft_fpv4_sp_d16_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mfpu=fpv4-sp-d16 -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned_size.json
index 1fbe1d701780..51c2bf0ec2dc 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_fpv4_sp_d16_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_fpv4_sp_d16_unaligned",
+            "VARIANT": "armv7m_soft_fpv4_sp_d16_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=softfp -march=armv7m -mfpu=fpv4-sp-d16",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_size.json
index b57365a2409b..931559275e9d 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_nofp_exn_rtti",
+            "VARIANT": "armv7m_soft_nofp_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned_size.json
index c2359656771f..0106f2406525 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_nofp_exn_rtti_unaligned",
+            "VARIANT": "armv7m_soft_nofp_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mfpu=none",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_size.json
similarity index 96%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_size.json
index 95414818551a..56aa638c0d86 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_nofp",
+            "VARIANT": "armv7m_soft_nofp_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned_size.json
index 23330c177dce..47685fa6843c 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv7m_soft_nofp_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv7m",
-            "VARIANT": "armv7m_soft_nofp_unaligned",
+            "VARIANT": "armv7m_soft_nofp_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv7m -mfpu=none",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size.json
new file mode 100644
index 000000000000..c28af26d38c2
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16",
+            "ENABLE_EXCEPTIONS": "ON",
+            "ENABLE_RTTI": "ON",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size.json
similarity index 98%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size.json
index 4cdb97fb0719..23476b0f3f9d 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size.json
index cc6712f715f7..f6cb0705745f 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_size.json
index 0bc660eb0fca..48a0eec0d7dc 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size.json
index 2fe66c463b77..09afeadaa76a 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_unaligned_size.json
new file mode 100644
index 000000000000..e1912266a32e
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fp_nomve_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_fp_nomve_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-sp-d16",
+            "ENABLE_EXCEPTIONS": "OFF",
+            "ENABLE_RTTI": "OFF",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size.json
new file mode 100644
index 000000000000..c18458545a17
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16",
+            "ENABLE_EXCEPTIONS": "ON",
+            "ENABLE_RTTI": "ON",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size.json
similarity index 98%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size.json
index 317337731989..57c478bb2a9f 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size.json
index 5dd9aa36bf02..d1a7310e986d 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_size.json
similarity index 99%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_size.json
index 948a6c0cc9ec..cdeebdf1f607 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size.json
index 4ff62a84f79d..db67c6b714eb 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_unaligned_size.json
new file mode 100644
index 000000000000..b49f2cbefe18
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_fpdp_nomve_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_fpdp_nomve_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+nomve -mfpu=fp-armv8-fullfp16-d16",
+            "ENABLE_EXCEPTIONS": "OFF",
+            "ENABLE_RTTI": "OFF",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_size.json
index 95284992695b..cdc2cfebf3f1 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve_exn_rtti",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size.json
new file mode 100644
index 000000000000..97137206ffa6
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none",
+            "ENABLE_EXCEPTIONS": "ON",
+            "ENABLE_RTTI": "ON",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size.json
similarity index 98%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size.json
index f13f6e0eed64..dea8c68e38c8 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size.json
index d368162faab9..2e80a4033803 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_size.json
index 8db8bac5b26d..1eb33c1637ec 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size.json
index 82a637446b73..5df7022d5573 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_size.json
index 33fa7d7305f8..1947e1561444 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_hard_nofp_mve",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_unaligned_size.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_unaligned_size.json
new file mode 100644
index 000000000000..1b8a9b780da1
--- /dev/null
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_hard_nofp_mve_unaligned_size.json
@@ -0,0 +1,40 @@
+{
+    "args": {
+        "common": {
+            "TARGET_ARCH": "armv8.1m.main",
+            "VARIANT": "armv8.1m.main_hard_nofp_mve_unaligned_size",
+            "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8.1m.main+mve -mfpu=none",
+            "ENABLE_EXCEPTIONS": "OFF",
+            "ENABLE_RTTI": "OFF",
+            "TEST_EXECUTOR": "qemu",
+            "QEMU_MACHINE": "mps3-an547",
+            "QEMU_CPU": "cortex-m55",
+            "BOOT_FLASH_ADDRESS": "0x00000000",
+            "BOOT_FLASH_SIZE": "512K",
+            "FLASH_ADDRESS": "0x60000000",
+            "FLASH_SIZE": "0x1000000",
+            "RAM_ADDRESS": "0x61000000",
+            "RAM_SIZE": "0x1000000",
+            "STACK_SIZE": "4K",
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
+        },
+        "picolibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "ON",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "newlib": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        },
+        "llvmlibc": {
+            "ENABLE_CXX_LIBS": "ON",
+            "ENABLE_LIBC_TESTS": "OFF",
+            "ENABLE_COMPILER_RT_TESTS": "OFF",
+            "ENABLE_LIBCXX_TESTS": "OFF"
+        }
+    }
+}
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size.json
index 3e97db8b2284..b007fc433f8e 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size.json
similarity index 98%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size.json
index 21a5151e1387..011c1043cfa2 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size.json
index 7ec5c66c41b1..175cdf6d5e28 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_size.json
similarity index 99%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_size.json
index c1b652d101ad..b7c1e355722e 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size.json
index d129d836e0da..9fd4b3fd53b3 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none -mbranch-protection=pac-ret+bti",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned_size.json
similarity index 91%
rename from arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned_size.json
index af2b9f4c1ebf..47affa657249 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8.1m.main_soft_nofp_nomve_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8.1m.main",
-            "VARIANT": "armv8.1m.main_soft_nofp_nomve_unaligned",
+            "VARIANT": "armv8.1m.main_soft_nofp_nomve_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8.1m.main+nomve -mfpu=none",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
@@ -16,7 +16,7 @@
             "RAM_ADDRESS": "0x61000000",
             "RAM_SIZE": "0x1000000",
             "STACK_SIZE": "4K",
-            "LIBRARY_BUILD_TYPE": "release"
+            "LIBRARY_BUILD_TYPE": "minsizerelease"
         },
         "picolibc": {
             "ENABLE_CXX_LIBS": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned_size.json
similarity index 94%
rename from arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned_size.json
index 4d45a707f419..4d693cc95b1a 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8m.main",
-            "VARIANT": "armv8m.main_hard_fp_exn_rtti_unaligned",
+            "VARIANT": "armv8m.main_hard_fp_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8m.main -mfpu=fpv5-sp-d16",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned_size.json
index bcad44dbd36f..e87a5b70b397 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_hard_fp_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8m.main",
-            "VARIANT": "armv8m.main_hard_fp_unaligned",
+            "VARIANT": "armv8m.main_hard_fp_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=hard -march=armv8m.main -mfpu=fpv5-sp-d16",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned_size.json
similarity index 99%
rename from arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned_size.json
index c685a092c22c..8be93aec212d 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_exn_rtti_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8m.main",
-            "VARIANT": "armv8m.main_soft_nofp_exn_rtti_unaligned",
+            "VARIANT": "armv8m.main_soft_nofp_exn_rtti_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8m.main -mfpu=none",
             "ENABLE_EXCEPTIONS": "ON",
             "ENABLE_RTTI": "ON",
diff --git a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned.json b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned_size.json
similarity index 95%
rename from arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned.json
rename to arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned_size.json
index ca7223bc274d..27f51f012b6e 100644
--- a/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned.json
+++ b/arm-software/embedded/arm-multilib/json/variants/armv8m.main_soft_nofp_unaligned_size.json
@@ -2,7 +2,7 @@
     "args": {
         "common": {
             "TARGET_ARCH": "armv8m.main",
-            "VARIANT": "armv8m.main_soft_nofp_unaligned",
+            "VARIANT": "armv8m.main_soft_nofp_unaligned_size",
             "COMPILE_FLAGS": "-mfloat-abi=soft -march=armv8m.main -mfpu=none",
             "ENABLE_EXCEPTIONS": "OFF",
             "ENABLE_RTTI": "OFF",
diff --git a/arm-software/embedded/arm-multilib/multilib.yaml.in b/arm-software/embedded/arm-multilib/multilib.yaml.in
index 0ee3da373185..4af1c2e24d74 100644
--- a/arm-software/embedded/arm-multilib/multilib.yaml.in
+++ b/arm-software/embedded/arm-multilib/multilib.yaml.in
@@ -192,3 +192,7 @@ Mappings:
 - Match: -mbranch-protection=(standard|pac-ret(\+leaf)?(\+bti)?|bti(\+pac-ret(\+leaf)?)?)
   Flags:
   - -mbranch-protection=pac-ret+bti
+
+- Match: -Oz
+  Flags:
+  - -Os
diff --git a/arm-software/embedded/docs/building-from-source.md b/arm-software/embedded/docs/building-from-source.md
index 11779304d9cb..c2feda47516e 100644
--- a/arm-software/embedded/docs/building-from-source.md
+++ b/arm-software/embedded/docs/building-from-source.md
@@ -27,8 +27,9 @@ Testing with QEMU is enabled by default, but can be disabled using the
 `-DENABLE_QEMU_TESTING=OFF` CMake option if testing is not required or QEMU is
 not installed.
 
-Some recent targets are not supported by QEMU, for these the Arm FVP models are
-used instead. These models are available free-of-change but are not
+Some recent targets are not supported by QEMU, for these the
+Arm Fixed Virtual Platform (FVP) models are
+used instead. These models are available free-of-charge but are not
 open-source, and come with their own licenses.
 
 These models can be downloaded and installed (into the source tree) with the
@@ -149,6 +150,23 @@ The same build directory can be used for both native and MinGW toolchains.
 ## Known limitations
 * Depending on the state of the sources, build errors may occur when
   the latest revisions of the llvm-project & picolibc repos are used.
+* Undefined `__aeabi_mem*` symbols with `-nostdlib`
+  When the `COMPILER_RT_EXCLUDE_LIBC_PROVIDED_ARM_AEABI_BUILTINS` flag is enabled,
+  the following ARM AEABI memory builtins are excluded from the ATFE `compiler-rt` build:
+  ```
+    __aeabi_memcmp
+    __aeabi_memset
+    __aeabi_memcpy
+    __aeabi_memmove
+  ```
+  This flag `COMPILER_RT_EXCLUDE_LIBC_PROVIDED_ARM_AEABI_BUILTINS` is enabled by default when
+  using picolibc and newlib as these functions are provided by the C library in both cases.
+  However, if the toolchain is used with `-nostdlib` and the C library is not linked in, users relying
+  solely on `compiler-rt` will have undefined symbol errors for these AEABI functions. To prevent the
+  generation of these AEABI function calls by the compiler, pass the following option to the compiler:
+  ```
+  -meabi gnu
+  ```
 
 ## Divergences from upstream
 
@@ -253,4 +271,4 @@ using specific test targets:
 `ninja check-unwind`
 
 Alternatively, `ninja check-all` runs all enabled tests.
-`ninja check-<VARAINT_NAME>` runs all the tests for that specific variant.
+`ninja check-<VARIANT_NAME>` runs all the tests for that specific variant.
diff --git a/arm-software/embedded/docs/llvmlibc.md b/arm-software/embedded/docs/llvmlibc.md
index b060d40b45d8..d77c30baa3f9 100644
--- a/arm-software/embedded/docs/llvmlibc.md
+++ b/arm-software/embedded/docs/llvmlibc.md
@@ -54,9 +54,9 @@ following command line options, in addition to `--target`, `-march` or
   `__llvm_libc_heap_limit` in addition to whatever other memory layout
   you want.
 
-* LLVM libc does not define errno. If you are using a function that
-  sets errno then you must implement the function `int *__llvm_libc_errno()`
-  that returns the address of your definition of errno.
+* LLVM libc does not define `errno`. If you are using a function that
+  sets `errno` then you must implement the function `int *__llvm_libc_errno()`
+  that returns the address of your definition of `errno`.
 
 For example:
 
@@ -74,6 +74,6 @@ directory containing sample programs that use LLVM libc.
 At present, this toolchain builds C++ libraries limited to what is supported with
 LLVM libc, for example, iostream is not available.
 
-At the time of writing this (2024-07), LLVM libc is a work in
+LLVM libc is a work in
 progress. It is incomplete: not all standard C library functionality
-is provided.
+is provided yet.
diff --git a/arm-software/embedded/llvmlibc-samples/src/cpp-baremetal-semihosting-llvmlibc/make.bat b/arm-software/embedded/llvmlibc-samples/src/cpp-baremetal-semihosting-llvmlibc/make.bat
index 7913aa7e3531..ab940734296e 100644
--- a/arm-software/embedded/llvmlibc-samples/src/cpp-baremetal-semihosting-llvmlibc/make.bat
+++ b/arm-software/embedded/llvmlibc-samples/src/cpp-baremetal-semihosting-llvmlibc/make.bat
@@ -40,6 +40,6 @@ if exist hello.hex del /q hello.hex
 @exit /B 1
 
 :build_fn
-%BIN_PATH%\clang++.exe --config=llvmlibc.cfg --target=armv6m-none-eabi -mfloat-abi=soft -march=armv6m -mfpu=none -nostartfiles -lcrt0-semihost -lsemihost -fno-exceptions -fno-rtti -g -T microbit-llvmlibc.ld -lm -o hello.elf hello.cpp
+%BIN_PATH%\clang++.exe --config=llvmlibc.cfg --target=armv6m-none-eabi -mfloat-abi=soft -march=armv6m -mfpu=none -nostartfiles -lcrt0-semihost -lsemihost -g -T microbit-llvmlibc.ld -lm -o hello.elf hello.cpp
 %BIN_PATH%\llvm-objcopy.exe -O ihex hello.elf hello.hex
 @exit /B
diff --git a/arm-software/embedded/patches/llvm-project/0008-Compiler-rt-FVP-and-QEMU-do-not-support-crash-signal.patch b/arm-software/embedded/patches/llvm-project/0008-Compiler-rt-FVP-and-QEMU-do-not-support-crash-signal.patch
new file mode 100644
index 000000000000..87effa4edb0e
--- /dev/null
+++ b/arm-software/embedded/patches/llvm-project/0008-Compiler-rt-FVP-and-QEMU-do-not-support-crash-signal.patch
@@ -0,0 +1,26 @@
+From 8a5771253787cda5c8caa31b7e8a870840903084 Mon Sep 17 00:00:00 2001
+From: Simi Pallipurath <simi.pallipurath@arm.com>
+Date: Tue, 22 Jul 2025 15:08:45 +0100
+Subject: [Compiler-rt] FVP and QEMU do not support crash signal handling.
+
+---
+ compiler-rt/test/builtins/Unit/aarch64/emupac.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/compiler-rt/test/builtins/Unit/aarch64/emupac.c b/compiler-rt/test/builtins/Unit/aarch64/emupac.c
+index 2140944b0eec..f3b8bb234f38 100644
+--- a/compiler-rt/test/builtins/Unit/aarch64/emupac.c
++++ b/compiler-rt/test/builtins/Unit/aarch64/emupac.c
+@@ -2,6 +2,9 @@
+ // RUN: %clang_builtins %s %librt -o %t
+ // RUN: %run %t 1
+ // RUN: %run %t 2
++// Some test runners(FVP and QEMU) can not handle crash-related signals.
++// TODO: A new LIT feature is needed to identify and handle this scenario.
++// XFAIL: *
+ // RUN: %expect_crash %run %t 3
+ // RUN: %expect_crash %run %t 4
+ 
+-- 
+2.34.1
+
diff --git a/arm-software/embedded/patches/llvm-project/0009-libc-NFC-Add-stdint.h-proxy-header-to-fix-dependency.patch b/arm-software/embedded/patches/llvm-project/0009-libc-NFC-Add-stdint.h-proxy-header-to-fix-dependency.patch
new file mode 100644
index 000000000000..7098824cfb97
--- /dev/null
+++ b/arm-software/embedded/patches/llvm-project/0009-libc-NFC-Add-stdint.h-proxy-header-to-fix-dependency.patch
@@ -0,0 +1,4576 @@
+From 4c8e38d1426e24a9c123e797f0225632a810e095 Mon Sep 17 00:00:00 2001
+From: lntue <lntue@google.com>
+Date: Thu, 24 Jul 2025 00:19:52 +0000
+Subject: [libc][NFC] Add stdint.h proxy header to fix dependency issue with
+ <stdint.h> includes. (#150303)
+
+https://github.com/llvm/llvm-project/issues/149993
+---
+ libc/benchmarks/gpu/BenchmarkLogger.cpp        |  3 +--
+ libc/benchmarks/gpu/CMakeLists.txt             |  1 +
+ libc/benchmarks/gpu/LibcGpuBenchmark.h         |  3 +--
+ libc/benchmarks/gpu/src/math/CMakeLists.txt    |  2 ++
+ .../gpu/timing/amdgpu/CMakeLists.txt           |  1 +
+ libc/benchmarks/gpu/timing/amdgpu/timing.h     |  3 +--
+ .../benchmarks/gpu/timing/nvptx/CMakeLists.txt |  1 +
+ libc/benchmarks/gpu/timing/nvptx/timing.h      |  3 +--
+ libc/config/CMakeLists.txt                     |  1 +
+ libc/config/gpu/app.h                          |  3 +--
+ libc/config/linux/app.h                        |  3 +--
+ libc/config/uefi/app.h                         |  3 +--
+ libc/hdr/CMakeLists.txt                        |  9 +++++++++
+ libc/hdr/stdint_proxy.h                        | 18 ++++++++++++++++++
+ libc/hdr/types/CMakeLists.txt                  |  2 ++
+ libc/include/CMakeLists.txt                    | 17 +++++++++--------
+ libc/src/__support/CMakeLists.txt              | 12 ++++++++++++
+ libc/src/__support/CPP/CMakeLists.txt          |  3 +++
+ libc/src/__support/CPP/bit.h                   |  3 +--
+ libc/src/__support/CPP/functional.h            |  3 +--
+ libc/src/__support/FPUtil/CMakeLists.txt       |  5 +++++
+ libc/src/__support/FPUtil/FPBits.h             |  3 +--
+ libc/src/__support/FPUtil/NormalFloat.h        |  3 +--
+ libc/src/__support/FPUtil/aarch64/FEnvImpl.h   |  2 +-
+ .../FPUtil/aarch64/fenv_darwin_impl.h          |  2 +-
+ libc/src/__support/FPUtil/arm/FEnvImpl.h       |  2 +-
+ libc/src/__support/FPUtil/bfloat16.h           |  3 +--
+ libc/src/__support/FPUtil/riscv/FEnvImpl.h     |  3 +--
+ libc/src/__support/FPUtil/x86_64/FEnvImpl.h    |  3 +--
+ .../FPUtil/x86_64/NextAfterLongDouble.h        |  3 +--
+ libc/src/__support/File/CMakeLists.txt         |  7 ++++---
+ libc/src/__support/File/file.h                 |  2 +-
+ libc/src/__support/File/linux/CMakeLists.txt   |  9 +++++----
+ libc/src/__support/File/linux/lseekImpl.h      |  2 +-
+ libc/src/__support/GPU/CMakeLists.txt          |  1 +
+ libc/src/__support/GPU/allocator.h             |  2 +-
+ libc/src/__support/HashTable/CMakeLists.txt    |  2 ++
+ libc/src/__support/HashTable/bitmask.h         |  2 +-
+ libc/src/__support/HashTable/table.h           |  2 +-
+ libc/src/__support/arg_list.h                  |  2 +-
+ libc/src/__support/big_int.h                   |  2 +-
+ libc/src/__support/block.h                     |  3 +--
+ libc/src/__support/blockstore.h                |  2 +-
+ libc/src/__support/detailed_powers_of_ten.h    |  3 +--
+ libc/src/__support/endian_internal.h           |  5 ++---
+ libc/src/__support/fixed_point/CMakeLists.txt  |  1 +
+ libc/src/__support/fixed_point/fx_rep.h        |  3 +--
+ libc/src/__support/float_to_string.h           |  3 +--
+ libc/src/__support/hash.h                      |  2 +-
+ libc/src/__support/high_precision_decimal.h    |  2 +-
+ libc/src/__support/integer_literals.h          |  2 +-
+ libc/src/__support/integer_to_string.h         |  3 +--
+ .../__support/macros/properties/CMakeLists.txt |  1 +
+ libc/src/__support/macros/properties/types.h   |  5 ++---
+ libc/src/__support/ryu_constants.h             |  2 +-
+ libc/src/__support/ryu_long_double_constants.h |  2 +-
+ libc/src/__support/str_to_float.h              |  3 +--
+ libc/src/__support/threads/CMakeLists.txt      |  2 ++
+ libc/src/__support/threads/CndVar.h            |  3 +--
+ .../src/__support/threads/linux/CMakeLists.txt |  3 +++
+ libc/src/__support/threads/linux/futex_word.h  |  2 +-
+ libc/src/__support/threads/linux/thread.cpp    |  2 +-
+ libc/src/__support/threads/thread.h            |  2 +-
+ libc/src/__support/wchar/CMakeLists.txt        |  3 ++-
+ libc/src/__support/wchar/mbstate.h             |  2 +-
+ libc/src/arpa/inet/CMakeLists.txt              |  4 ++++
+ libc/src/arpa/inet/htonl.h                     |  2 +-
+ libc/src/arpa/inet/htons.h                     |  2 +-
+ libc/src/arpa/inet/ntohl.h                     |  2 +-
+ libc/src/arpa/inet/ntohs.h                     |  2 +-
+ libc/src/compiler/generic/CMakeLists.txt       |  1 +
+ libc/src/compiler/generic/__stack_chk_fail.cpp |  2 +-
+ libc/src/inttypes/CMakeLists.txt               |  2 ++
+ libc/src/inttypes/strtoimax.h                  |  2 +-
+ libc/src/inttypes/strtoumax.h                  |  2 +-
+ libc/src/link/CMakeLists.txt                   |  2 ++
+ libc/src/math/generic/CMakeLists.txt           |  1 +
+ libc/src/math/generic/expxf16.h                |  3 +--
+ libc/src/pthread/CMakeLists.txt                |  1 +
+ libc/src/pthread/pthread_attr_setstack.cpp     |  2 +-
+ libc/src/sched/linux/CMakeLists.txt            |  1 +
+ libc/src/sched/linux/sched_getaffinity.cpp     |  2 +-
+ libc/src/spawn/CMakeLists.txt                  |  1 +
+ libc/src/spawn/file_actions.h                  |  2 +-
+ libc/src/stdio/gpu/CMakeLists.txt              |  1 +
+ libc/src/stdio/gpu/fgets.cpp                   |  3 +--
+ libc/src/stdlib/CMakeLists.txt                 |  9 +++++++--
+ libc/src/stdlib/a64l.cpp                       |  3 +--
+ libc/src/stdlib/bsearch.cpp                    |  2 +-
+ libc/src/stdlib/l64a.cpp                       |  3 +--
+ libc/src/stdlib/qsort.cpp                      |  3 +--
+ libc/src/stdlib/qsort_data.h                   |  3 +--
+ libc/src/stdlib/qsort_r.cpp                    |  3 +--
+ libc/src/stdlib/quick_sort.h                   |  3 +--
+ libc/src/string/CMakeLists.txt                 |  1 +
+ libc/src/string/memory_utils/CMakeLists.txt    |  1 +
+ libc/src/string/memory_utils/op_generic.h      |  3 +--
+ libc/src/string/memory_utils/utils.h           |  2 +-
+ .../string/memory_utils/x86_64/inline_memcpy.h |  2 +-
+ libc/src/string/string_utils.h                 |  1 +
+ libc/src/sys/stat/linux/CMakeLists.txt         |  1 +
+ libc/src/sys/stat/linux/kernel_statx.h         |  2 +-
+ libc/src/time/CMakeLists.txt                   |  4 +++-
+ libc/src/time/linux/CMakeLists.txt             |  1 +
+ libc/src/time/linux/nanosleep.cpp              |  2 +-
+ libc/src/time/strftime_core/CMakeLists.txt     |  1 +
+ libc/src/time/strftime_core/core_structs.h     |  3 +--
+ libc/src/time/time_constants.h                 |  2 +-
+ libc/src/time/time_utils.cpp                   |  3 +--
+ libc/src/time/time_utils.h                     |  3 +--
+ libc/src/unistd/linux/CMakeLists.txt           |  4 ++++
+ libc/src/unistd/linux/ftruncate.cpp            |  2 +-
+ libc/src/unistd/linux/pread.cpp                |  2 +-
+ libc/src/unistd/linux/pwrite.cpp               |  2 +-
+ libc/src/unistd/linux/truncate.cpp             |  2 +-
+ libc/startup/baremetal/CMakeLists.txt          |  6 ++++++
+ libc/startup/baremetal/fini.cpp                |  2 +-
+ libc/startup/baremetal/init.cpp                |  2 +-
+ libc/startup/linux/CMakeLists.txt              |  1 +
+ libc/startup/linux/do_start.cpp                |  2 +-
+ libc/test/IntegrationTest/CMakeLists.txt       |  1 +
+ libc/test/IntegrationTest/test.cpp             |  2 +-
+ libc/test/UnitTest/CMakeLists.txt              |  7 +++++++
+ libc/test/UnitTest/ExecuteFunction.h           |  2 +-
+ libc/test/UnitTest/HermeticTestUtils.cpp       |  2 +-
+ libc/test/UnitTest/PrintfMatcher.cpp           |  3 +--
+ libc/test/UnitTest/RoundingModeUtils.h         |  2 +-
+ libc/test/UnitTest/ScanfMatcher.cpp            |  3 +--
+ libc/test/UnitTest/TestLogger.cpp              |  7 +++----
+ .../integration/src/pthread/CMakeLists.txt     |  5 +++++
+ .../src/pthread/pthread_equal_test.cpp         |  3 +--
+ .../src/pthread/pthread_mutex_test.cpp         |  8 +++-----
+ .../src/pthread/pthread_name_test.cpp          |  3 +--
+ .../src/pthread/pthread_once_test.cpp          |  3 +--
+ .../integration/src/pthread/pthread_test.cpp   |  2 +-
+ libc/test/integration/src/spawn/CMakeLists.txt |  1 +
+ .../integration/src/spawn/posix_spawn_test.cpp |  2 +-
+ libc/test/src/CMakeLists.txt                   |  4 +++-
+ libc/test/src/__support/CMakeLists.txt         |  1 +
+ libc/test/src/__support/CPP/CMakeLists.txt     |  1 +
+ libc/test/src/__support/CPP/bit_test.cpp       |  3 +--
+ .../src/__support/HashTable/CMakeLists.txt     |  1 +
+ .../src/__support/HashTable/group_test.cpp     |  2 +-
+ .../__support/str_to_float_comparison_test.cpp |  2 +-
+ libc/test/src/fenv/feclearexcept_test.cpp      |  1 -
+ libc/test/src/math/LdExpTest.h                 |  2 +-
+ libc/test/src/math/acosf_test.cpp              |  3 +--
+ libc/test/src/math/acoshf16_test.cpp           |  2 +-
+ libc/test/src/math/acoshf_test.cpp             |  3 +--
+ libc/test/src/math/asinf_test.cpp              |  3 +--
+ libc/test/src/math/asinhf_test.cpp             |  3 +--
+ libc/test/src/math/atanf_test.cpp              |  3 +--
+ libc/test/src/math/atanhf16_test.cpp           |  2 +-
+ libc/test/src/math/atanhf_test.cpp             |  3 +--
+ libc/test/src/math/cosf_test.cpp               |  3 +--
+ libc/test/src/math/coshf_test.cpp              |  3 +--
+ libc/test/src/math/erff_test.cpp               |  2 +-
+ libc/test/src/math/exp10_test.cpp              |  2 +-
+ libc/test/src/math/exp10f_test.cpp             |  2 +-
+ libc/test/src/math/exp10m1f_test.cpp           |  2 +-
+ libc/test/src/math/exp2_test.cpp               |  2 +-
+ libc/test/src/math/exp2f_test.cpp              |  2 +-
+ libc/test/src/math/exp2m1f_test.cpp            |  2 +-
+ libc/test/src/math/exp_test.cpp                |  2 +-
+ libc/test/src/math/expf_test.cpp               |  2 +-
+ libc/test/src/math/expm1_test.cpp              |  2 +-
+ libc/test/src/math/expm1f_test.cpp             |  2 +-
+ .../test/src/math/in_float_range_test_helper.h |  2 +-
+ libc/test/src/math/log10_test.cpp              |  2 +-
+ libc/test/src/math/log10f_test.cpp             |  2 +-
+ libc/test/src/math/log1p_test.cpp              |  2 +-
+ libc/test/src/math/log1pf_test.cpp             |  2 +-
+ libc/test/src/math/log2_test.cpp               |  2 +-
+ libc/test/src/math/log2f_test.cpp              |  2 +-
+ libc/test/src/math/log_test.cpp                |  2 +-
+ libc/test/src/math/logf_test.cpp               |  2 +-
+ libc/test/src/math/performance_testing/Timer.h |  2 +-
+ .../math/performance_testing/fmodf16_perf.cpp  |  2 +-
+ libc/test/src/math/powf_test.cpp               |  2 +-
+ libc/test/src/math/sdcomp26094.h               |  3 +--
+ libc/test/src/math/sincosf_test.cpp            |  2 +-
+ libc/test/src/math/sinf_test.cpp               |  2 +-
+ libc/test/src/math/sinhf_test.cpp              |  2 +-
+ libc/test/src/math/sinpif_test.cpp             |  2 +-
+ libc/test/src/math/smoke/LdExpTest.h           |  3 +--
+ libc/test/src/math/smoke/acosf_test.cpp        |  2 +-
+ libc/test/src/math/smoke/acoshf_test.cpp       |  2 +-
+ libc/test/src/math/smoke/asinf_test.cpp        |  2 +-
+ libc/test/src/math/smoke/asinhf_test.cpp       |  2 +-
+ libc/test/src/math/smoke/atanf_test.cpp        |  2 +-
+ libc/test/src/math/smoke/atanhf_test.cpp       |  2 +-
+ libc/test/src/math/smoke/cosf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/coshf_test.cpp        |  3 +--
+ libc/test/src/math/smoke/cospif_test.cpp       |  3 +--
+ libc/test/src/math/smoke/erff_test.cpp         |  3 +--
+ libc/test/src/math/smoke/exp10_test.cpp        |  3 +--
+ libc/test/src/math/smoke/exp10f_test.cpp       |  3 +--
+ libc/test/src/math/smoke/exp2_test.cpp         |  3 +--
+ libc/test/src/math/smoke/exp2f_test.cpp        |  3 +--
+ libc/test/src/math/smoke/exp_test.cpp          |  3 +--
+ libc/test/src/math/smoke/expf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/expm1_test.cpp        |  3 +--
+ libc/test/src/math/smoke/expm1f_test.cpp       |  3 +--
+ libc/test/src/math/smoke/log10_test.cpp        |  3 +--
+ libc/test/src/math/smoke/log10f_test.cpp       |  3 +--
+ libc/test/src/math/smoke/log1pf_test.cpp       |  3 +--
+ libc/test/src/math/smoke/log2_test.cpp         |  3 +--
+ libc/test/src/math/smoke/log2f_test.cpp        |  3 +--
+ libc/test/src/math/smoke/log_test.cpp          |  3 +--
+ libc/test/src/math/smoke/logf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/powf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/sincosf_test.cpp      |  3 +--
+ libc/test/src/math/smoke/sinf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/sinhf_test.cpp        |  3 +--
+ libc/test/src/math/smoke/sinpif_test.cpp       |  3 +--
+ libc/test/src/math/smoke/tanf_test.cpp         |  3 +--
+ libc/test/src/math/smoke/tanhf_test.cpp        |  3 +--
+ libc/test/src/math/tanf_test.cpp               |  2 +-
+ libc/test/src/math/tanhf_test.cpp              |  2 +-
+ libc/test/src/signal/CMakeLists.txt            |  1 +
+ libc/test/src/signal/sigaltstack_test.cpp      |  2 +-
+ libc/test/src/spawn/CMakeLists.txt             |  1 +
+ .../spawn/posix_spawn_file_actions_test.cpp    |  2 +-
+ libc/test/src/stdlib/CMakeLists.txt            |  3 +++
+ libc/test/src/stdlib/memalignment_test.cpp     |  3 +--
+ libc/test/src/stdlib/strtoint32_test.cpp       |  3 +--
+ libc/test/src/stdlib/strtoint64_test.cpp       |  3 +--
+ .../src/string/memory_utils/CMakeLists.txt     |  1 +
+ .../string/memory_utils/memory_check_utils.h   |  2 +-
+ .../src/string/memory_utils/protected_pages.h  |  2 +-
+ libc/utils/MPCWrapper/CMakeLists.txt           |  1 +
+ libc/utils/MPCWrapper/MPCUtils.cpp             |  3 +--
+ libc/utils/MPCWrapper/MPCUtils.h               |  3 +--
+ libc/utils/MPFRWrapper/CMakeLists.txt          |  2 ++
+ libc/utils/MPFRWrapper/MPCommon.h              |  3 +--
+ libc/utils/MPFRWrapper/MPFRUtils.h             |  3 +--
+ .../llvm-project-overlay/libc/BUILD.bazel      |  7 +++++++
+ .../libc/test/UnitTest/BUILD.bazel             |  4 ++++
+ .../libc/test/libc_test_rules.bzl              |  1 +
+ 239 files changed, 353 insertions(+), 295 deletions(-)
+ create mode 100644 libc/hdr/stdint_proxy.h
+
+diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp
+index 0d644fa3c37b..d5996a74f6dd 100644
+--- a/libc/benchmarks/gpu/BenchmarkLogger.cpp
++++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp
+@@ -1,4 +1,5 @@
+ #include "benchmarks/gpu/BenchmarkLogger.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/string.h"
+ #include "src/__support/CPP/string_view.h"
+ #include "src/__support/OSUtil/io.h"               // write_to_stderr
+@@ -7,8 +8,6 @@
+ #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+ #include "src/__support/uint128.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace benchmarks {
+ 
+diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt
+index b58f4fd8b1a4..6ec64bf270b5 100644
+--- a/libc/benchmarks/gpu/CMakeLists.txt
++++ b/libc/benchmarks/gpu/CMakeLists.txt
+@@ -45,6 +45,7 @@ add_unittest_framework_library(
+     LibcGpuBenchmark.h
+     BenchmarkLogger.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.big_int
+     libc.src.__support.c_string
+     libc.src.__support.CPP.string
+diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h
+index f2cfbfbfdcdf..a6cf62dd30ce 100644
+--- a/libc/benchmarks/gpu/LibcGpuBenchmark.h
++++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h
+@@ -3,6 +3,7 @@
+ 
+ #include "benchmarks/gpu/BenchmarkLogger.h"
+ #include "benchmarks/gpu/timing/timing.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/functional.h"
+ #include "src/__support/CPP/limits.h"
+@@ -13,8 +14,6 @@
+ #include "src/stdlib/rand.h"
+ #include "src/time/clock.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ namespace benchmarks {
+diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt
+index 6870c0244901..7a12ce4e61c9 100644
+--- a/libc/benchmarks/gpu/src/math/CMakeLists.txt
++++ b/libc/benchmarks/gpu/src/math/CMakeLists.txt
+@@ -31,6 +31,7 @@ add_benchmark(
+   SRCS
+     sin_benchmark.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.math.sin
+     libc.src.math.sinf
+     libc.src.stdlib.srand
+@@ -51,6 +52,7 @@ add_benchmark(
+   SRCS
+     atan2_benchmark.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.math.atan2
+     libc.src.stdlib.srand
+     libc.src.stdlib.rand
+diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+index aa5dcd33bee9..dd7c2d342f70 100644
+--- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
++++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt
+@@ -3,6 +3,7 @@ add_header_library(
+   HDRS
+     timing.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.macros.config
+     libc.src.__support.macros.attributes
+diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h
+index 4cf7e9838add..1eb711ba951e 100644
+--- a/libc/benchmarks/gpu/timing/amdgpu/timing.h
++++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+ #define LLVM_LIBC_UTILS_GPU_TIMING_AMDGPU
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/GPU/utils.h"
+@@ -16,8 +17,6 @@
+ #include "src/__support/macros/attributes.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // Returns the overhead associated with calling the profiling region. This
+diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+index 2723c8940814..a19c16ee4e44 100644
+--- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
++++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt
+@@ -3,6 +3,7 @@ add_header_library(
+   HDRS
+     timing.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.macros.config
+     libc.src.__support.macros.attributes
+diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h
+index ece7d9a6c539..e5e98b3ca087 100644
+--- a/libc/benchmarks/gpu/timing/nvptx/timing.h
++++ b/libc/benchmarks/gpu/timing/nvptx/timing.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+ #define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/GPU/utils.h"
+@@ -16,8 +17,6 @@
+ #include "src/__support/macros/attributes.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // Returns the overhead associated with calling the profiling region. This
+diff --git a/libc/config/CMakeLists.txt b/libc/config/CMakeLists.txt
+index cf38ae3eed72..4758276469f6 100644
+--- a/libc/config/CMakeLists.txt
++++ b/libc/config/CMakeLists.txt
+@@ -3,5 +3,6 @@ add_header_library(
+   HDRS
+     app.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+ )
+diff --git a/libc/config/gpu/app.h b/libc/config/gpu/app.h
+index 148c51b70220..17ef3ae7b1ce 100644
+--- a/libc/config/gpu/app.h
++++ b/libc/config/gpu/app.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_CONFIG_GPU_APP_H
+ #define LLVM_LIBC_CONFIG_GPU_APP_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/architectures.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // TODO: Move other global values here and export them to the host.
+diff --git a/libc/config/linux/app.h b/libc/config/linux/app.h
+index 188d34816454..f3d11da9fc14 100644
+--- a/libc/config/linux/app.h
++++ b/libc/config/linux/app.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_CONFIG_LINUX_APP_H
+ #define LLVM_LIBC_CONFIG_LINUX_APP_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/architectures.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // Data structure to capture properties of the linux/ELF TLS image.
+diff --git a/libc/config/uefi/app.h b/libc/config/uefi/app.h
+index 0374a47ba340..1f181ed5f9f1 100644
+--- a/libc/config/uefi/app.h
++++ b/libc/config/uefi/app.h
+@@ -9,13 +9,12 @@
+ #ifndef LLVM_LIBC_CONFIG_UEFI_APP_H
+ #define LLVM_LIBC_CONFIG_UEFI_APP_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "include/llvm-libc-types/EFI_HANDLE.h"
+ #include "include/llvm-libc-types/EFI_SYSTEM_TABLE.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/architectures.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // Data structure which captures properties of a UEFI application.
+diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
+index 052a773a4fce..5fc25d0ca568 100644
+--- a/libc/hdr/CMakeLists.txt
++++ b/libc/hdr/CMakeLists.txt
+@@ -243,5 +243,14 @@ add_proxy_header_library(
+     libc.include.llvm-libc-macros.offsetof_macro
+ )
+ 
++# stdint.h header.
++add_proxy_header_library(
++  stdint_proxy
++  HDRS
++    stdint_proxy.h
++  FULL_BUILD_DEPENDS
++    libc.include.stdint
++)
++
+ add_subdirectory(types)
+ add_subdirectory(func)
+diff --git a/libc/hdr/stdint_proxy.h b/libc/hdr/stdint_proxy.h
+new file mode 100644
+index 000000000000..8e815679a4e2
+--- /dev/null
++++ b/libc/hdr/stdint_proxy.h
+@@ -0,0 +1,18 @@
++//===-- stdint.h ----------------------------------------------------------===//
++//
++// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
++// See https://llvm.org/LICENSE.txt for license information.
++// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
++//
++//===----------------------------------------------------------------------===//
++
++#ifndef LLVM_LIBC_HDR_STDINT_PROXY_H
++#define LLVM_LIBC_HDR_STDINT_PROXY_H
++
++// This target is to make sure we have correct build order in full build mode,
++// that is `libc.include.stdint` is added to the dependency of all targets
++// that use <stdint.h> header.
++
++#include <stdint.h>
++
++#endif // LLVM_LIBC_HDR_STDINT_PROXY_H
+diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt
+index e4b3cb0faa82..2dc71f9791e1 100644
+--- a/libc/hdr/types/CMakeLists.txt
++++ b/libc/hdr/types/CMakeLists.txt
+@@ -92,6 +92,7 @@ add_proxy_header_library(
+   DEPENDS
+     libc.hdr.fcntl_overlay
+   FULL_BUILD_DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.llvm-libc-types.struct_flock
+ )
+ 
+@@ -102,6 +103,7 @@ add_proxy_header_library(
+   DEPENDS
+     libc.hdr.fcntl_overlay
+   FULL_BUILD_DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.llvm-libc-types.struct_flock64
+ )
+ 
+diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
+index 55268d19529c..ce811d17784a 100644
+--- a/libc/include/CMakeLists.txt
++++ b/libc/include/CMakeLists.txt
+@@ -102,6 +102,14 @@ add_header_macro(
+     .llvm-libc-types.fexcept_t
+ )
+ 
++add_header_macro(
++  stdint
++  ../libc/include/stdint.yaml
++  stdint.h
++  DEPENDS
++    .llvm-libc-macros.stdint_macros
++)
++
+ add_header_macro(
+   inttypes
+   ../libc/include/inttypes.yaml
+@@ -110,6 +118,7 @@ add_header_macro(
+     .llvm_libc_common_h
+     .llvm-libc-types.imaxdiv_t
+     .llvm-libc-macros.inttypes_macros
++    .stdint
+ )
+ 
+ add_header_macro(
+@@ -120,14 +129,6 @@ add_header_macro(
+     .llvm-libc-macros.float_macros
+ )
+ 
+-add_header_macro(
+-  stdint
+-  ../libc/include/stdint.yaml
+-  stdint.h
+-  DEPENDS
+-    .llvm-libc-macros.stdint_macros
+-)
+-
+ add_header_macro(
+   limits
+   ../libc/include/limits.yaml
+diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
+index 37a27cc9b700..2196d9e23bba 100644
+--- a/libc/src/__support/CMakeLists.txt
++++ b/libc/src/__support/CMakeLists.txt
+@@ -15,6 +15,7 @@ add_header_library(
+   HDRS
+     block.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.algorithm
+     libc.src.__support.CPP.limits
+     libc.src.__support.CPP.new
+@@ -86,6 +87,7 @@ add_header_library(
+     blockstore.h
+   DEPENDS
+     .libc_assert
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.new
+ )
+ 
+@@ -97,6 +99,8 @@ add_header_library(
+     macros/properties/architectures.h
+     macros/attributes.h
+     macros/config.h
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_header_library(
+@@ -199,6 +203,7 @@ add_header_library(
+     integer_to_string.h
+   DEPENDS
+     .big_int
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.algorithm
+     libc.src.__support.CPP.limits
+@@ -215,6 +220,7 @@ add_header_library(
+     ryu_long_double_constants.h
+   DEPENDS
+     .libc_assert
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.type_traits
+     libc.src.__support.FPUtil.fp_bits
+     libc.src.__support.common
+@@ -226,6 +232,7 @@ add_header_library(
+     high_precision_decimal.h
+   DEPENDS
+     .str_to_integer
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_header_library(
+@@ -240,6 +247,7 @@ add_header_library(
+     .str_to_num_result
+     .uint128
+     libc.hdr.errno_macros
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.limits
+@@ -257,6 +265,7 @@ add_header_library(
+     integer_literals.h
+   DEPENDS
+     .uint128
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.limits
+ )
+ 
+@@ -290,6 +299,7 @@ add_header_library(
+   HDRS
+     arg_list.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+ )
+ 
+@@ -329,6 +339,7 @@ add_header_library(
+   DEPENDS
+     .math_extras
+     .number_pair
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.array
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.type_traits
+@@ -361,6 +372,7 @@ add_header_library(
+     hash.h
+   DEPENDS
+     .uint128
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.limits
+     libc.src.__support.macros.attributes
+diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt
+index d2ba00a5384d..8b65a8839ab2 100644
+--- a/libc/src/__support/CPP/CMakeLists.txt
++++ b/libc/src/__support/CPP/CMakeLists.txt
+@@ -17,6 +17,7 @@ add_header_library(
+   DEPENDS
+     .limits
+     .type_traits
++    libc.hdr.stdint_proxy
+     libc.src.__support.macros.attributes
+     libc.src.__support.macros.sanitizer
+ )
+@@ -39,6 +40,8 @@ add_header_library(
+   functional
+   HDRS
+     functional.h
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_header_library(
+diff --git a/libc/src/__support/CPP/bit.h b/libc/src/__support/CPP/bit.h
+index e491f3e03266..df1b177ecb10 100644
+--- a/libc/src/__support/CPP/bit.h
++++ b/libc/src/__support/CPP/bit.h
+@@ -11,14 +11,13 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_BIT_H
+ #define LLVM_LIBC_SRC___SUPPORT_CPP_BIT_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h" // numeric_limits
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/macros/attributes.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/sanitizer.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace cpp {
+ 
+diff --git a/libc/src/__support/CPP/functional.h b/libc/src/__support/CPP/functional.h
+index 50cfa256b668..5c43d2236971 100644
+--- a/libc/src/__support/CPP/functional.h
++++ b/libc/src/__support/CPP/functional.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_FUNCTIONAL_H
+ #define LLVM_LIBC_SRC___SUPPORT_CPP_FUNCTIONAL_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/type_traits/enable_if.h"
+ #include "src/__support/CPP/type_traits/is_convertible.h"
+ #include "src/__support/CPP/type_traits/is_same.h"
+@@ -19,8 +20,6 @@
+ #include "src/__support/macros/attributes.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace cpp {
+ 
+diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
+index cc941f23135a..ffa0e075e383 100644
+--- a/libc/src/__support/FPUtil/CMakeLists.txt
++++ b/libc/src/__support/FPUtil/CMakeLists.txt
+@@ -6,6 +6,7 @@ add_header_library(
+     libc.hdr.types.fenv_t
+     libc.hdr.fenv_macros
+     libc.hdr.math_macros
++    libc.hdr.stdint_proxy
+     libc.src.__support.macros.attributes
+     libc.src.errno.errno
+ )
+@@ -27,6 +28,7 @@ add_header_library(
+   HDRS
+     FPBits.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.type_traits
+@@ -71,6 +73,7 @@ add_header_library(
+     NormalFloat.h
+   DEPENDS
+     .fp_bits
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.type_traits
+     libc.src.__support.common
+ )
+@@ -236,6 +239,7 @@ add_header_library(
+     .nearest_integer_operations
+     .normal_float
+     libc.hdr.math_macros
++    libc.hdr.stdint_proxy
+     libc.src.errno.errno
+     libc.src.__support.common
+     libc.src.__support.CPP.bit
+@@ -264,6 +268,7 @@ add_header_library(
+   DEPENDS
+     .cast
+     .dyadic_float
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.type_traits
+     libc.src.__support.macros.config
+diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
+index 9e21136558a0..2f695c158375 100644
+--- a/libc/src/__support/FPUtil/FPBits.h
++++ b/libc/src/__support/FPUtil/FPBits.h
+@@ -15,6 +15,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_FPBITS_H
+ #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FPBITS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/common.h"
+@@ -26,8 +27,6 @@
+ #include "src/__support/sign.h"                    // Sign
+ #include "src/__support/uint128.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+ 
+diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h
+index a2f285fc6fb9..b30e36fd03e9 100644
+--- a/libc/src/__support/FPUtil/NormalFloat.h
++++ b/libc/src/__support/FPUtil/NormalFloat.h
+@@ -11,12 +11,11 @@
+ 
+ #include "FPBits.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+ 
+diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
+index 914155a01631..18182ed977cc 100644
+--- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
++++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
+@@ -18,9 +18,9 @@
+ #endif
+ 
+ #include <arm_acle.h>
+-#include <stdint.h>
+ 
+ #include "hdr/fenv_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/fenv_t.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ 
+diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+index dcce76b6116b..a2066d1025f6 100644
+--- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
++++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+@@ -18,9 +18,9 @@
+ #endif
+ 
+ #include <arm_acle.h>
+-#include <stdint.h>
+ 
+ #include "hdr/fenv_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/fenv_t.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ 
+diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h
+index aaf37c0a045a..64ab4a9189b7 100644
+--- a/libc/src/__support/FPUtil/arm/FEnvImpl.h
++++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h
+@@ -10,11 +10,11 @@
+ #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_ARM_FENVIMPL_H
+ 
+ #include "hdr/fenv_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/fenv_t.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/macros/attributes.h" // For LIBC_INLINE
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+diff --git a/libc/src/__support/FPUtil/bfloat16.h b/libc/src/__support/FPUtil/bfloat16.h
+index bc0b8b23896f..627f4759f9d1 100644
+--- a/libc/src/__support/FPUtil/bfloat16.h
++++ b/libc/src/__support/FPUtil/bfloat16.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FPUTIL_BFLOAT16_H
+ #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_BFLOAT16_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/FPUtil/cast.h"
+@@ -16,8 +17,6 @@
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/types.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+ 
+diff --git a/libc/src/__support/FPUtil/riscv/FEnvImpl.h b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
+index 2f525eb5dac5..cb2d2d59fb86 100644
+--- a/libc/src/__support/FPUtil/riscv/FEnvImpl.h
++++ b/libc/src/__support/FPUtil/riscv/FEnvImpl.h
+@@ -10,13 +10,12 @@
+ #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_RISCV_FENVIMPL_H
+ 
+ #include "hdr/fenv_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/fenv_t.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/macros/attributes.h" // For LIBC_INLINE_ASM
+ #include "src/__support/macros/config.h"     // For LIBC_INLINE
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+ 
+diff --git a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
+index 560727c22978..5da509796d84 100644
+--- a/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
++++ b/libc/src/__support/FPUtil/x86_64/FEnvImpl.h
+@@ -17,8 +17,7 @@
+ #error "Invalid include"
+ #endif
+ 
+-#include <stdint.h>
+-
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/fenv_t.h"
+ #include "src/__support/macros/sanitizer.h"
+ 
+diff --git a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h
+index 2e6b297ae934..74a991d40115 100644
+--- a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h
++++ b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h
+@@ -16,12 +16,11 @@
+ #error "Invalid include"
+ #endif
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/FPUtil/FEnvImpl.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+ 
+diff --git a/libc/src/__support/File/CMakeLists.txt b/libc/src/__support/File/CMakeLists.txt
+index 5a4af5e70e28..253243ff40de 100644
+--- a/libc/src/__support/File/CMakeLists.txt
++++ b/libc/src/__support/File/CMakeLists.txt
+@@ -12,13 +12,14 @@ add_object_library(
+   HDRS
+     file.h
+   DEPENDS
++    libc.hdr.stdio_macros
++    libc.hdr.stdint_proxy
++    libc.hdr.func.realloc
++    libc.hdr.types.off_t
+     libc.src.__support.CPP.new
+     libc.src.__support.CPP.span
+     libc.src.__support.threads.mutex
+     libc.src.__support.error_or
+-    libc.hdr.types.off_t
+-    libc.hdr.stdio_macros
+-    libc.hdr.func.realloc
+ )
+ 
+ add_object_library(
+diff --git a/libc/src/__support/File/file.h b/libc/src/__support/File/file.h
+index 5c97a9c6419f..3652e448c7f5 100644
+--- a/libc/src/__support/File/file.h
++++ b/libc/src/__support/File/file.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FILE_FILE_H
+ #define LLVM_LIBC_SRC___SUPPORT_FILE_FILE_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/stdio_macros.h"
+ #include "hdr/types/off_t.h"
+ #include "src/__support/CPP/new.h"
+@@ -18,7 +19,6 @@
+ #include "src/__support/threads/mutex.h"
+ 
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/File/linux/CMakeLists.txt b/libc/src/__support/File/linux/CMakeLists.txt
+index 84e3d5608361..7a7bd6cdb83a 100644
+--- a/libc/src/__support/File/linux/CMakeLists.txt
++++ b/libc/src/__support/File/linux/CMakeLists.txt
+@@ -8,16 +8,17 @@ add_object_library(
+     lseekImpl.h
+   DEPENDS
+     libc.hdr.fcntl_macros
++    libc.hdr.stdio_macros
++    libc.hdr.stdint_proxy
++    libc.hdr.types.off_t
++    libc.hdr.types.FILE
+     libc.include.sys_syscall
+     libc.include.sys_stat
+     libc.src.__support.CPP.new
+     libc.src.__support.OSUtil.osutil
+-    libc.src.errno.errno
+     libc.src.__support.error_or
+     libc.src.__support.File.file
+-    libc.hdr.types.off_t
+-    libc.hdr.types.FILE
+-    libc.hdr.stdio_macros
++    libc.src.errno.errno
+ )
+ 
+ add_object_library(
+diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h
+index 300e5c5dd55b..c22a6c59b65f 100644
+--- a/libc/src/__support/File/linux/lseekImpl.h
++++ b/libc/src/__support/File/linux/lseekImpl.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FILE_LINUX_LSEEKIMPL_H
+ #define LLVM_LIBC_SRC___SUPPORT_FILE_LINUX_LSEEKIMPL_H
+ 
++#include "hdr/stdint_proxy.h" // For uint64_t.
+ #include "hdr/types/off_t.h"
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+@@ -16,7 +17,6 @@
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>      // For uint64_t.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/__support/GPU/CMakeLists.txt b/libc/src/__support/GPU/CMakeLists.txt
+index 4ffee011be96..f8fdfeb9da9d 100644
+--- a/libc/src/__support/GPU/CMakeLists.txt
++++ b/libc/src/__support/GPU/CMakeLists.txt
+@@ -16,6 +16,7 @@ add_object_library(
+   HDRS
+     allocator.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.RPC.rpc_client
+     libc.src.__support.CPP.atomic
+diff --git a/libc/src/__support/GPU/allocator.h b/libc/src/__support/GPU/allocator.h
+index a7cf8bceef27..67a7ff531fcd 100644
+--- a/libc/src/__support/GPU/allocator.h
++++ b/libc/src/__support/GPU/allocator.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_GPU_ALLOCATOR_H
+ #define LLVM_LIBC_SRC___SUPPORT_GPU_ALLOCATOR_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace gpu {
+diff --git a/libc/src/__support/HashTable/CMakeLists.txt b/libc/src/__support/HashTable/CMakeLists.txt
+index 698b8d0dfa68..d5861e7e825c 100644
+--- a/libc/src/__support/HashTable/CMakeLists.txt
++++ b/libc/src/__support/HashTable/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_header_library(
+   FLAGS
+     EXPLICIT_SIMD_OPT
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.bit
+     libc.src.__support.macros.properties.cpu_features
+@@ -26,6 +27,7 @@ add_header_library(
+     table.h
+   DEPENDS
+     .bitmask
++    libc.hdr.stdint_proxy
+     libc.hdr.types.ENTRY
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.new
+diff --git a/libc/src/__support/HashTable/bitmask.h b/libc/src/__support/HashTable/bitmask.h
+index 3cac481b8060..a4af496f83ee 100644
+--- a/libc/src/__support/HashTable/bitmask.h
++++ b/libc/src/__support/HashTable/bitmask.h
+@@ -9,11 +9,11 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_BITMASK_H
+ #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_BITMASK_H
+ 
++#include "hdr/stdint_proxy.h" // uint8_t, uint64_t
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/cpu_features.h"
+ #include <stddef.h> // size_t
+-#include <stdint.h> // uint8_t, uint64_t
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
+index 10dd9711afbf..966ee0fcaf0a 100644
+--- a/libc/src/__support/HashTable/table.h
++++ b/libc/src/__support/HashTable/table.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
+ #define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/ENTRY.h"
+ #include "src/__support/CPP/bit.h" // bit_ceil
+ #include "src/__support/CPP/new.h"
+@@ -21,7 +22,6 @@
+ #include "src/string/memory_utils/inline_strcmp.h"
+ #include "src/string/string_utils.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/__support/arg_list.h b/libc/src/__support/arg_list.h
+index 66afa67e320b..1e26a5e8ef9c 100644
+--- a/libc/src/__support/arg_list.h
++++ b/libc/src/__support/arg_list.h
+@@ -9,12 +9,12 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_ARG_LIST_H
+ #define LLVM_LIBC_SRC___SUPPORT_ARG_LIST_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+ #include <stdarg.h>
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
+index 85db31d01399..10e35ef4fd24 100644
+--- a/libc/src/__support/big_int.h
++++ b/libc/src/__support/big_int.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_BIG_INT_H
+ #define LLVM_LIBC_SRC___SUPPORT_BIG_INT_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/bit.h" // countl_zero
+ #include "src/__support/CPP/limits.h"
+@@ -23,7 +24,6 @@
+ #include "src/__support/number_pair.h"
+ 
+ #include <stddef.h> // For size_t
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/block.h b/libc/src/__support/block.h
+index a58c38bbb7ac..b0d657609324 100644
+--- a/libc/src/__support/block.h
++++ b/libc/src/__support/block.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_BLOCK_H
+ #define LLVM_LIBC_SRC___SUPPORT_BLOCK_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/algorithm.h"
+ #include "src/__support/CPP/cstddef.h"
+ #include "src/__support/CPP/limits.h"
+@@ -20,8 +21,6 @@
+ #include "src/__support/macros/config.h"
+ #include "src/__support/math_extras.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ /// Returns the value rounded down to the nearest multiple of alignment.
+diff --git a/libc/src/__support/blockstore.h b/libc/src/__support/blockstore.h
+index efe2234eace5..61ee0ee80a2b 100644
+--- a/libc/src/__support/blockstore.h
++++ b/libc/src/__support/blockstore.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_BLOCKSTORE_H
+ #define LLVM_LIBC_SRC___SUPPORT_BLOCKSTORE_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/new.h"
+ #include "src/__support/CPP/type_traits.h"
+@@ -16,7 +17,6 @@
+ #include "src/__support/macros/config.h"
+ 
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/detailed_powers_of_ten.h b/libc/src/__support/detailed_powers_of_ten.h
+index 28741b8a3f56..ab6f2beb6735 100644
+--- a/libc/src/__support/detailed_powers_of_ten.h
++++ b/libc/src/__support/detailed_powers_of_ten.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_DETAILED_POWERS_OF_TEN_H
+ #define LLVM_LIBC_SRC___SUPPORT_DETAILED_POWERS_OF_TEN_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+ 
+diff --git a/libc/src/__support/endian_internal.h b/libc/src/__support/endian_internal.h
+index 77839ad75455..c78090ad85e0 100644
+--- a/libc/src/__support/endian_internal.h
++++ b/libc/src/__support/endian_internal.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_ENDIAN_INTERNAL_H
+ #define LLVM_LIBC_SRC___SUPPORT_ENDIAN_INTERNAL_H
+ 
+-#include "common.h"
++#include "hdr/stdint_proxy.h"
++#include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // We rely on compiler preprocessor defines to allow for cross compilation.
+diff --git a/libc/src/__support/fixed_point/CMakeLists.txt b/libc/src/__support/fixed_point/CMakeLists.txt
+index 235c03048cfa..145eedb6ae8e 100644
+--- a/libc/src/__support/fixed_point/CMakeLists.txt
++++ b/libc/src/__support/fixed_point/CMakeLists.txt
+@@ -3,6 +3,7 @@ add_header_library(
+   HDRS
+     fx_rep.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.llvm-libc-macros.stdfix_macros
+     libc.src.__support.macros.attributes
+     libc.src.__support.CPP.type_traits
+diff --git a/libc/src/__support/fixed_point/fx_rep.h b/libc/src/__support/fixed_point/fx_rep.h
+index 7227fffa683a..e68be57dedf2 100644
+--- a/libc/src/__support/fixed_point/fx_rep.h
++++ b/libc/src/__support/fixed_point/fx_rep.h
+@@ -9,13 +9,12 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_REP_H
+ #define LLVM_LIBC_SRC___SUPPORT_FIXED_POINT_FX_REP_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "include/llvm-libc-macros/stdfix-macros.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/macros/attributes.h" // LIBC_INLINE, LIBC_INLINE_VAR
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ #ifdef LIBC_COMPILER_HAS_FIXED_POINT
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h
+index d88bf844b15a..cab146a5b869 100644
+--- a/libc/src/__support/float_to_string.h
++++ b/libc/src/__support/float_to_string.h
+@@ -9,8 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_FLOAT_TO_STRING_H
+ #define LLVM_LIBC_SRC___SUPPORT_FLOAT_TO_STRING_H
+ 
+-#include <stdint.h>
+-
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/FPUtil/FPBits.h"
+diff --git a/libc/src/__support/hash.h b/libc/src/__support/hash.h
+index 49138b1f43b9..6dfaadf08715 100644
+--- a/libc/src/__support/hash.h
++++ b/libc/src/__support/hash.h
+@@ -9,12 +9,12 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_HASH_H
+ #define LLVM_LIBC_SRC___SUPPORT_HASH_H
+ 
++#include "hdr/stdint_proxy.h"                // For uint64_t
+ #include "src/__support/CPP/bit.h"           // rotl
+ #include "src/__support/CPP/limits.h"        // numeric_limits
+ #include "src/__support/macros/attributes.h" // LIBC_INLINE
+ #include "src/__support/macros/config.h"
+ #include "src/__support/uint128.h" // UInt128
+-#include <stdint.h>                // For uint64_t
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/__support/high_precision_decimal.h b/libc/src/__support/high_precision_decimal.h
+index cb4b50c31544..08af78602d2a 100644
+--- a/libc/src/__support/high_precision_decimal.h
++++ b/libc/src/__support/high_precision_decimal.h
+@@ -15,11 +15,11 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_HIGH_PRECISION_DECIMAL_H
+ #define LLVM_LIBC_SRC___SUPPORT_HIGH_PRECISION_DECIMAL_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h"
+ #include "src/__support/ctype_utils.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/str_to_integer.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/__support/integer_literals.h b/libc/src/__support/integer_literals.h
+index f68b7ef12c87..67d241d93b28 100644
+--- a/libc/src/__support/integer_literals.h
++++ b/libc/src/__support/integer_literals.h
+@@ -13,13 +13,13 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H
+ #define LLVM_LIBC_SRC___SUPPORT_INTEGER_LITERALS_H
+ 
++#include "hdr/stdint_proxy.h"         // uintxx_t
+ #include "src/__support/CPP/limits.h" // CHAR_BIT
+ #include "src/__support/ctype_utils.h"
+ #include "src/__support/macros/attributes.h" // LIBC_INLINE
+ #include "src/__support/macros/config.h"
+ #include "src/__support/uint128.h" // UInt128
+ #include <stddef.h>                // size_t
+-#include <stdint.h>                // uintxx_t
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/integer_to_string.h b/libc/src/__support/integer_to_string.h
+index 65bdcf16b386..29449bd73973 100644
+--- a/libc/src/__support/integer_to_string.h
++++ b/libc/src/__support/integer_to_string.h
+@@ -57,8 +57,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_INTEGER_TO_STRING_H
+ #define LLVM_LIBC_SRC___SUPPORT_INTEGER_TO_STRING_H
+ 
+-#include <stdint.h>
+-
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/algorithm.h" // max
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/bit.h"
+diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
+index 80ed63a2fbcf..dfa2f9c57249 100644
+--- a/libc/src/__support/macros/properties/CMakeLists.txt
++++ b/libc/src/__support/macros/properties/CMakeLists.txt
+@@ -34,6 +34,7 @@ add_header_library(
+     .cpu_features
+     .os
+     libc.hdr.float_macros
++    libc.hdr.stdint_proxy
+     libc.include.llvm-libc-macros.float16_macros
+     libc.include.llvm-libc-types.float128
+ )
+diff --git a/libc/src/__support/macros/properties/types.h b/libc/src/__support/macros/properties/types.h
+index aec4a488c3cb..3259c8a6a1d1 100644
+--- a/libc/src/__support/macros/properties/types.h
++++ b/libc/src/__support/macros/properties/types.h
+@@ -10,7 +10,8 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
+ #define LLVM_LIBC_SRC___SUPPORT_MACROS_PROPERTIES_TYPES_H
+ 
+-#include "hdr/float_macros.h"                        // LDBL_MANT_DIG
++#include "hdr/float_macros.h" // LDBL_MANT_DIG
++#include "hdr/stdint_proxy.h" // UINT64_MAX, __SIZEOF_INT128__
+ #include "include/llvm-libc-macros/float16-macros.h" // LIBC_TYPES_HAS_FLOAT16
+ #include "include/llvm-libc-types/float128.h"        // float128
+ #include "src/__support/macros/config.h"             // LIBC_NAMESPACE_DECL
+@@ -19,8 +20,6 @@
+ #include "src/__support/macros/properties/cpu_features.h"
+ #include "src/__support/macros/properties/os.h"
+ 
+-#include <stdint.h> // UINT64_MAX, __SIZEOF_INT128__
+-
+ // 'long double' properties.
+ #if (LDBL_MANT_DIG == 53)
+ #define LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64
+diff --git a/libc/src/__support/ryu_constants.h b/libc/src/__support/ryu_constants.h
+index 1ecb34e661ba..09bc2355ef30 100644
+--- a/libc/src/__support/ryu_constants.h
++++ b/libc/src/__support/ryu_constants.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_RYU_CONSTANTS_H
+ #define LLVM_LIBC_SRC___SUPPORT_RYU_CONSTANTS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ constexpr size_t TABLE_SHIFT_CONST = 120;
+ constexpr size_t IDX_SIZE = 16;
+diff --git a/libc/src/__support/ryu_long_double_constants.h b/libc/src/__support/ryu_long_double_constants.h
+index 8ebb29727789..487fd4a411f7 100644
+--- a/libc/src/__support/ryu_long_double_constants.h
++++ b/libc/src/__support/ryu_long_double_constants.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_RYU_LONG_DOUBLE_CONSTANTS_H
+ #define LLVM_LIBC_SRC___SUPPORT_RYU_LONG_DOUBLE_CONSTANTS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ constexpr size_t TABLE_SHIFT_CONST = 120;
+ constexpr size_t IDX_SIZE = 128;
+diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
+index a7dd7ce0ae25..3d35d8a30aff 100644
+--- a/libc/src/__support/str_to_float.h
++++ b/libc/src/__support/str_to_float.h
+@@ -16,6 +16,7 @@
+ #define LLVM_LIBC_SRC___SUPPORT_STR_TO_FLOAT_H
+ 
+ #include "hdr/errno_macros.h" // For ERANGE
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/CPP/limits.h"
+ #include "src/__support/CPP/optional.h"
+@@ -33,8 +34,6 @@
+ #include "src/__support/str_to_num_result.h"
+ #include "src/__support/uint128.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+ 
+diff --git a/libc/src/__support/threads/CMakeLists.txt b/libc/src/__support/threads/CMakeLists.txt
+index bd49bbb5ad2f..b0843463e532 100644
+--- a/libc/src/__support/threads/CMakeLists.txt
++++ b/libc/src/__support/threads/CMakeLists.txt
+@@ -49,6 +49,7 @@ add_header_library(
+   HDRS
+     thread.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.atomic
+     libc.src.__support.CPP.optional
+@@ -64,6 +65,7 @@ if(TARGET libc.src.__support.threads.${LIBC_TARGET_OS}.thread)
+     DEPENDS
+       .mutex
+       .${LIBC_TARGET_OS}.thread
++      libc.hdr.stdint_proxy
+       libc.src.__support.common
+       libc.src.__support.fixedvector
+       libc.src.__support.CPP.array
+diff --git a/libc/src/__support/threads/CndVar.h b/libc/src/__support/threads/CndVar.h
+index e42fa145faea..7b2a7126ca09 100644
+--- a/libc/src/__support/threads/CndVar.h
++++ b/libc/src/__support/threads/CndVar.h
+@@ -9,13 +9,12 @@
+ #ifndef LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+ #define LLVM_LIBC___SUPPORT_SRC_THREADS_LINUX_CNDVAR_H
+ 
++#include "hdr/stdint_proxy.h" // uint32_t
+ #include "src/__support/macros/config.h"
+ #include "src/__support/threads/linux/futex_utils.h" // Futex
+ #include "src/__support/threads/linux/raw_mutex.h"   // RawMutex
+ #include "src/__support/threads/mutex.h"             // Mutex
+ 
+-#include <stdint.h> // uint32_t
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ class CndVar {
+diff --git a/libc/src/__support/threads/linux/CMakeLists.txt b/libc/src/__support/threads/linux/CMakeLists.txt
+index 364e7e2b9058..cbb788645a13 100644
+--- a/libc/src/__support/threads/linux/CMakeLists.txt
++++ b/libc/src/__support/threads/linux/CMakeLists.txt
+@@ -2,6 +2,8 @@ add_header_library(
+   futex_word_type
+   HDRS
+     futex_word.h
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+ 
+ if(NOT TARGET libc.src.__support.OSUtil.osutil)
+@@ -114,6 +116,7 @@ add_object_library(
+   HDRS
+     ../CndVar.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+     libc.src.__support.threads.linux.futex_word_type
+diff --git a/libc/src/__support/threads/linux/futex_word.h b/libc/src/__support/threads/linux/futex_word.h
+index a5a6a0c110ca..1cf7befea5c8 100644
+--- a/libc/src/__support/threads/linux/futex_word.h
++++ b/libc/src/__support/threads/linux/futex_word.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_FUTEX_WORD_H
+ #define LLVM_LIBC_SRC___SUPPORT_THREADS_LINUX_FUTEX_WORD_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ #include <sys/syscall.h>
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp
+index baad26aed685..d9e479eefcae 100644
+--- a/libc/src/__support/threads/linux/thread.cpp
++++ b/libc/src/__support/threads/linux/thread.cpp
+@@ -23,10 +23,10 @@
+ #endif
+ 
+ #include "hdr/fcntl_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include <linux/param.h> // For EXEC_PAGESIZE.
+ #include <linux/prctl.h> // For PR_SET_NAME
+ #include <linux/sched.h> // For CLONE_* flags.
+-#include <stdint.h>
+ #include <sys/mman.h>    // For PROT_* and MAP_* definitions.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+diff --git a/libc/src/__support/threads/thread.h b/libc/src/__support/threads/thread.h
+index f2b1f6bbb253..114ab4932af7 100644
+--- a/libc/src/__support/threads/thread.h
++++ b/libc/src/__support/threads/thread.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_THREADS_THREAD_H
+ #define LLVM_LIBC_SRC___SUPPORT_THREADS_THREAD_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/atomic.h"
+ #include "src/__support/CPP/optional.h"
+ #include "src/__support/CPP/string_view.h"
+@@ -21,7 +22,6 @@
+ #include <linux/param.h> // for exec_pagesize.
+ 
+ #include <stddef.h> // For size_t
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt
+index 802441d37fe9..cf3e641d2d2d 100644
+--- a/libc/src/__support/wchar/CMakeLists.txt
++++ b/libc/src/__support/wchar/CMakeLists.txt
+@@ -3,7 +3,8 @@ add_header_library(
+   HDRS
+     mbstate.h
+   DEPENDS
+-    libc.hdr.types.char32_t    
++    libc.hdr.stdint_proxy
++    libc.hdr.types.char32_t
+ )
+ 
+ add_header_library(
+diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
+index 32304a521524..96256cae14cb 100644
+--- a/libc/src/__support/wchar/mbstate.h
++++ b/libc/src/__support/wchar/mbstate.h
+@@ -9,9 +9,9 @@
+ #ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+ #define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/char32_t.h"
+ #include "src/__support/common.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/src/arpa/inet/CMakeLists.txt b/libc/src/arpa/inet/CMakeLists.txt
+index cdd53e915034..1f39a076fde9 100644
+--- a/libc/src/arpa/inet/CMakeLists.txt
++++ b/libc/src/arpa/inet/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_entrypoint_object(
+   HDRS
+     htonl.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.arpa_inet
+     libc.src.__support.common
+ )
+@@ -16,6 +17,7 @@ add_entrypoint_object(
+   HDRS
+     htons.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.arpa_inet
+     libc.src.__support.common
+ )
+@@ -27,6 +29,7 @@ add_entrypoint_object(
+   HDRS
+     ntohl.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.arpa_inet
+     libc.src.__support.common
+ )
+@@ -38,6 +41,7 @@ add_entrypoint_object(
+   HDRS
+     ntohs.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.arpa_inet
+     libc.src.__support.common
+ )
+diff --git a/libc/src/arpa/inet/htonl.h b/libc/src/arpa/inet/htonl.h
+index e444972c771e..34f017df7334 100644
+--- a/libc/src/arpa/inet/htonl.h
++++ b/libc/src/arpa/inet/htonl.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_ARPA_INET_HTONL_H
+ #define LLVM_LIBC_SRC_ARPA_INET_HTONL_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/arpa/inet/htons.h b/libc/src/arpa/inet/htons.h
+index 35c2acdcfae4..5e283cd49241 100644
+--- a/libc/src/arpa/inet/htons.h
++++ b/libc/src/arpa/inet/htons.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_ARPA_INET_HTONS_H
+ #define LLVM_LIBC_SRC_ARPA_INET_HTONS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/arpa/inet/ntohl.h b/libc/src/arpa/inet/ntohl.h
+index 40079650d6fd..76ba02ba00dd 100644
+--- a/libc/src/arpa/inet/ntohl.h
++++ b/libc/src/arpa/inet/ntohl.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_ARPA_INET_NTOHL_H
+ #define LLVM_LIBC_SRC_ARPA_INET_NTOHL_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/arpa/inet/ntohs.h b/libc/src/arpa/inet/ntohs.h
+index 5fe3ebcb6252..b98c85264923 100644
+--- a/libc/src/arpa/inet/ntohs.h
++++ b/libc/src/arpa/inet/ntohs.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_ARPA_INET_NTOHS_H
+ #define LLVM_LIBC_SRC_ARPA_INET_NTOHS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/compiler/generic/CMakeLists.txt b/libc/src/compiler/generic/CMakeLists.txt
+index 2fc8f7f64c85..d9ad42e465e4 100644
+--- a/libc/src/compiler/generic/CMakeLists.txt
++++ b/libc/src/compiler/generic/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_entrypoint_object(
+   HDRS
+     ../__stack_chk_fail.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.OSUtil.osutil
+     libc.src.stdlib.abort
+ )
+diff --git a/libc/src/compiler/generic/__stack_chk_fail.cpp b/libc/src/compiler/generic/__stack_chk_fail.cpp
+index 00e976ad8bc2..7edd16cf6a86 100644
+--- a/libc/src/compiler/generic/__stack_chk_fail.cpp
++++ b/libc/src/compiler/generic/__stack_chk_fail.cpp
+@@ -7,9 +7,9 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/compiler/__stack_chk_fail.h"
++#include "hdr/stdint_proxy.h" // For uintptr_t
+ #include "src/__support/OSUtil/io.h"
+ #include "src/stdlib/abort.h"
+-#include <stdint.h> // For uintptr_t
+ 
+ extern "C" {
+ 
+diff --git a/libc/src/inttypes/CMakeLists.txt b/libc/src/inttypes/CMakeLists.txt
+index c3111ed80ff8..3a48c9a1a8b9 100644
+--- a/libc/src/inttypes/CMakeLists.txt
++++ b/libc/src/inttypes/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_entrypoint_object(
+   HDRS
+     strtoimax.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.str_to_integer
+     libc.src.errno.errno
+ )
+@@ -16,6 +17,7 @@ add_entrypoint_object(
+   HDRS
+     strtoumax.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.str_to_integer
+     libc.src.errno.errno
+ )
+diff --git a/libc/src/inttypes/strtoimax.h b/libc/src/inttypes/strtoimax.h
+index 804d07c219aa..fedc458bac1d 100644
+--- a/libc/src/inttypes/strtoimax.h
++++ b/libc/src/inttypes/strtoimax.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_INTTYPES_STRTOIMAX_H
+ #define LLVM_LIBC_SRC_INTTYPES_STRTOIMAX_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/inttypes/strtoumax.h b/libc/src/inttypes/strtoumax.h
+index 4c53c0362b9e..d5b82afa3109 100644
+--- a/libc/src/inttypes/strtoumax.h
++++ b/libc/src/inttypes/strtoumax.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_SRC_INTTYPES_STRTOUMAX_H
+ #define LLVM_LIBC_SRC_INTTYPES_STRTOUMAX_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/link/CMakeLists.txt b/libc/src/link/CMakeLists.txt
+index f68950e625dc..55f5edfab7d9 100644
+--- a/libc/src/link/CMakeLists.txt
++++ b/libc/src/link/CMakeLists.txt
+@@ -4,4 +4,6 @@ add_entrypoint_object(
+     dl_iterate_phdr.cpp
+   HDRS
+     dl_iterate_phdr.h
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
+index a9237741788a..788e9ede4111 100644
+--- a/libc/src/math/generic/CMakeLists.txt
++++ b/libc/src/math/generic/CMakeLists.txt
+@@ -5075,6 +5075,7 @@ add_header_library(
+   HDRS
+     expxf16.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.FPUtil.cast
+     libc.src.__support.FPUtil.fp_bits
+     libc.src.__support.FPUtil.multiply_add
+diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h
+index 05ac95d58682..5f25983bec9a 100644
+--- a/libc/src/math/generic/expxf16.h
++++ b/libc/src/math/generic/expxf16.h
+@@ -9,14 +9,13 @@
+ #ifndef LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H
+ #define LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/FPUtil/cast.h"
+ #include "src/__support/FPUtil/multiply_add.h"
+ #include "src/__support/FPUtil/nearest_integer.h"
+ #include "src/__support/macros/attributes.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+-
+ #include "src/__support/math/expf16_utils.h"
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/pthread/CMakeLists.txt b/libc/src/pthread/CMakeLists.txt
+index c8c66805667f..c5db6fa910db 100644
+--- a/libc/src/pthread/CMakeLists.txt
++++ b/libc/src/pthread/CMakeLists.txt
+@@ -99,6 +99,7 @@ add_entrypoint_object(
+   HDRS
+     pthread_attr_setstack.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.pthread.pthread_attr_setstacksize
+     libc.src.errno.errno
+diff --git a/libc/src/pthread/pthread_attr_setstack.cpp b/libc/src/pthread/pthread_attr_setstack.cpp
+index 767f959b1400..b66072cbcf05 100644
+--- a/libc/src/pthread/pthread_attr_setstack.cpp
++++ b/libc/src/pthread/pthread_attr_setstack.cpp
+@@ -9,13 +9,13 @@
+ #include "pthread_attr_setstack.h"
+ #include "pthread_attr_setstacksize.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/threads/thread.h" // For STACK_ALIGNMENT
+ 
+ #include <pthread.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/sched/linux/CMakeLists.txt b/libc/src/sched/linux/CMakeLists.txt
+index 4852c9053bea..e690e76fd748 100644
+--- a/libc/src/sched/linux/CMakeLists.txt
++++ b/libc/src/sched/linux/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_entrypoint_object(
+   HDRS
+     ../sched_getaffinity.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.sched
+     libc.src.__support.OSUtil.osutil
+     libc.src.errno.errno
+diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp
+index e005819e2a97..4a5e91af5de8 100644
+--- a/libc/src/sched/linux/sched_getaffinity.cpp
++++ b/libc/src/sched/linux/sched_getaffinity.cpp
+@@ -8,13 +8,13 @@
+ 
+ #include "src/sched/sched_getaffinity.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ 
+ #include <sched.h>
+-#include <stdint.h>
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/spawn/CMakeLists.txt b/libc/src/spawn/CMakeLists.txt
+index 11621da782ed..fe5f81effc3f 100644
+--- a/libc/src/spawn/CMakeLists.txt
++++ b/libc/src/spawn/CMakeLists.txt
+@@ -7,6 +7,7 @@ add_header_library(
+   HDRS
+     file_actions.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.spawn
+ )
+ 
+diff --git a/libc/src/spawn/file_actions.h b/libc/src/spawn/file_actions.h
+index 80b92959c6a4..c69640a3ddd0 100644
+--- a/libc/src/spawn/file_actions.h
++++ b/libc/src/spawn/file_actions.h
+@@ -9,9 +9,9 @@
+ #ifndef LLVM_LIBC_SRC_SPAWN_FILE_ACTIONS_H
+ #define LLVM_LIBC_SRC_SPAWN_FILE_ACTIONS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+ #include <spawn.h> // For mode_t
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt
+index bea113409263..8412153bf580 100644
+--- a/libc/src/stdio/gpu/CMakeLists.txt
++++ b/libc/src/stdio/gpu/CMakeLists.txt
+@@ -259,6 +259,7 @@ add_entrypoint_object(
+   HDRS
+     ../fgets.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.hdr.types.FILE
+     .gpu_file
+ )
+diff --git a/libc/src/stdio/gpu/fgets.cpp b/libc/src/stdio/gpu/fgets.cpp
+index 5447e86d1c76..e1c6088b6008 100644
+--- a/libc/src/stdio/gpu/fgets.cpp
++++ b/libc/src/stdio/gpu/fgets.cpp
+@@ -9,12 +9,11 @@
+ #include "src/stdio/fgets.h"
+ 
+ #include "file.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/stdio_macros.h" // for EOF.
+ #include "hdr/types/FILE.h"
+ #include "src/__support/common.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ LLVM_LIBC_FUNCTION(char *, fgets,
+diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
+index 74ae864f72e2..aa653c38a8c3 100644
+--- a/libc/src/stdlib/CMakeLists.txt
++++ b/libc/src/stdlib/CMakeLists.txt
+@@ -191,8 +191,9 @@ add_entrypoint_object(
+   HDRS
+     a64l.h
+   DEPENDS
+-    libc.src.__support.ctype_utils
++    libc.hdr.stdint_proxy
+     libc.hdr.types.size_t
++    libc.src.__support.ctype_utils
+ )
+ 
+ add_entrypoint_object(
+@@ -202,8 +203,9 @@ add_entrypoint_object(
+   HDRS
+     l64a.h
+   DEPENDS
+-    libc.src.__support.ctype_utils
++    libc.hdr.stdint_proxy
+     libc.hdr.types.size_t
++    libc.src.__support.ctype_utils
+ )
+ 
+ add_entrypoint_object(
+@@ -287,6 +289,7 @@ add_header_library(
+     heap_sort.h
+     quick_sort.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.stdlib
+     libc.src.__support.CPP.cstddef
+ )
+@@ -299,6 +302,7 @@ add_entrypoint_object(
+     qsort.h
+   DEPENDS
+     .qsort_util
++    libc.hdr.stdint_proxy
+     libc.hdr.types.size_t
+ )
+ 
+@@ -310,6 +314,7 @@ add_entrypoint_object(
+     qsort_r.h
+   DEPENDS
+     .qsort_util
++    libc.hdr.stdint_proxy
+     libc.hdr.types.size_t
+ )
+ 
+diff --git a/libc/src/stdlib/a64l.cpp b/libc/src/stdlib/a64l.cpp
+index 84be2d208f7d..690b70d65677 100644
+--- a/libc/src/stdlib/a64l.cpp
++++ b/libc/src/stdlib/a64l.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/stdlib/a64l.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/size_t.h"
+ #include "src/__support/common.h"
+ #include "src/__support/ctype_utils.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // I'm not sure this should go in ctype_utils since the specific ordering of
+diff --git a/libc/src/stdlib/bsearch.cpp b/libc/src/stdlib/bsearch.cpp
+index 69b3e749c03f..f0848051173e 100644
+--- a/libc/src/stdlib/bsearch.cpp
++++ b/libc/src/stdlib/bsearch.cpp
+@@ -10,7 +10,7 @@
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/stdlib/l64a.cpp b/libc/src/stdlib/l64a.cpp
+index b5506c3e40c3..d59e65e7dc4c 100644
+--- a/libc/src/stdlib/l64a.cpp
++++ b/libc/src/stdlib/l64a.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/stdlib/l64a.h"
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/size_t.h"
+ #include "src/__support/common.h"
+ #include "src/__support/ctype_utils.h"
+ #include "src/__support/libc_assert.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ // the standard says to only use up to 6 characters. Null terminator is
+diff --git a/libc/src/stdlib/qsort.cpp b/libc/src/stdlib/qsort.cpp
+index 0bf5fc798052..f66b686d4e54 100644
+--- a/libc/src/stdlib/qsort.cpp
++++ b/libc/src/stdlib/qsort.cpp
+@@ -7,12 +7,11 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/stdlib/qsort.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdlib/qsort_util.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ LLVM_LIBC_FUNCTION(void, qsort,
+diff --git a/libc/src/stdlib/qsort_data.h b/libc/src/stdlib/qsort_data.h
+index aa6d9bbc123d..739fce88ab75 100644
+--- a/libc/src/stdlib/qsort_data.h
++++ b/libc/src/stdlib/qsort_data.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_SRC_STDLIB_QSORT_DATA_H
+ #define LLVM_LIBC_SRC_STDLIB_QSORT_DATA_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/cstddef.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+ 
+diff --git a/libc/src/stdlib/qsort_r.cpp b/libc/src/stdlib/qsort_r.cpp
+index 4e60998b6a6d..47448201eddb 100644
+--- a/libc/src/stdlib/qsort_r.cpp
++++ b/libc/src/stdlib/qsort_r.cpp
+@@ -7,12 +7,11 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/stdlib/qsort_r.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdlib/qsort_util.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ 
+ LLVM_LIBC_FUNCTION(void, qsort_r,
+diff --git a/libc/src/stdlib/quick_sort.h b/libc/src/stdlib/quick_sort.h
+index 8ba0098854d1..00115bdfbae7 100644
+--- a/libc/src/stdlib/quick_sort.h
++++ b/libc/src/stdlib/quick_sort.h
+@@ -9,13 +9,12 @@
+ #ifndef LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
+ #define LLVM_LIBC_SRC_STDLIB_QUICK_SORT_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/CPP/cstddef.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdlib/qsort_pivot.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+ 
+diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
+index 8784bc3750cb..809decfbe5f0 100644
+--- a/libc/src/string/CMakeLists.txt
++++ b/libc/src/string/CMakeLists.txt
+@@ -17,6 +17,7 @@ add_header_library(
+   DEPENDS
+     libc.hdr.types.size_t
+     libc.hdr.limits_macros
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.bitset
+     libc.src.__support.CPP.type_traits
+     libc.src.__support.common
+diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
+index a967247db53f..4f3873b064d7 100644
+--- a/libc/src/string/memory_utils/CMakeLists.txt
++++ b/libc/src/string/memory_utils/CMakeLists.txt
+@@ -32,6 +32,7 @@ add_header_library(
+     x86_64/inline_memmove.h
+     x86_64/inline_memset.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.common
+     libc.src.__support.CPP.bit
+     libc.src.__support.CPP.cstddef
+diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
+index 9349cfdd1d02..37603410e3a5 100644
+--- a/libc/src/string/memory_utils/op_generic.h
++++ b/libc/src/string/memory_utils/op_generic.h
+@@ -23,6 +23,7 @@
+ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/common.h"
+@@ -34,8 +35,6 @@
+ #include "src/string/memory_utils/op_builtin.h"
+ #include "src/string/memory_utils/utils.h"
+ 
+-#include <stdint.h>
+-
+ static_assert((UINTPTR_MAX == 4294967295U) ||
+                   (UINTPTR_MAX == 18446744073709551615UL),
+               "We currently only support 32- or 64-bit platforms");
+diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
+index c08608c87bb2..0f9c9e36a3dc 100644
+--- a/libc/src/string/memory_utils/utils.h
++++ b/libc/src/string/memory_utils/utils.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
+ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
+ 
++#include "hdr/stdint_proxy.h" // intptr_t / uintptr_t / INT32_MAX / INT32_MIN
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/CPP/cstddef.h"
+ #include "src/__support/CPP/type_traits.h"
+@@ -18,7 +19,6 @@
+ #include "src/__support/macros/properties/architectures.h"
+ 
+ #include <stddef.h> // size_t
+-#include <stdint.h> // intptr_t / uintptr_t / INT32_MAX / INT32_MIN
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+index 68f64fb1a502..bf3aa1f755ad 100644
+--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
++++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+@@ -8,6 +8,7 @@
+ #ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H
+ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMCPY_H
+ 
++#include "hdr/stdint_proxy.h"                  // SIZE_MAX
+ #include "src/__support/macros/attributes.h"   // LIBC_INLINE_VAR
+ #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+ #include "src/string/memory_utils/op_builtin.h"
+@@ -15,7 +16,6 @@
+ #include "src/string/memory_utils/utils.h"
+ 
+ #include <stddef.h> // size_t
+-#include <stdint.h> // SIZE_MAX
+ 
+ #ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+ #error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h
+index 4f56263fce8e..80e5783c7890 100644
+--- a/libc/src/string/string_utils.h
++++ b/libc/src/string/string_utils.h
+@@ -15,6 +15,7 @@
+ #define LLVM_LIBC_SRC_STRING_STRING_UTILS_H
+ 
+ #include "hdr/limits_macros.h"
++#include "hdr/stdint_proxy.h" // uintptr_t
+ #include "hdr/types/size_t.h"
+ #include "src/__support/CPP/bitset.h"
+ #include "src/__support/CPP/type_traits.h" // cpp::is_same_v
+diff --git a/libc/src/sys/stat/linux/CMakeLists.txt b/libc/src/sys/stat/linux/CMakeLists.txt
+index 9aeb14636c2c..c99872c3ad09 100644
+--- a/libc/src/sys/stat/linux/CMakeLists.txt
++++ b/libc/src/sys/stat/linux/CMakeLists.txt
+@@ -73,6 +73,7 @@ add_header_library(
+   HDRS
+     kernel_statx.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.sys_stat
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+diff --git a/libc/src/sys/stat/linux/kernel_statx.h b/libc/src/sys/stat/linux/kernel_statx.h
+index d0e223aec3e1..455ab17a2fb9 100644
+--- a/libc/src/sys/stat/linux/kernel_statx.h
++++ b/libc/src/sys/stat/linux/kernel_statx.h
+@@ -9,11 +9,11 @@
+ #ifndef LLVM_LIBC_SRC_SYS_STAT_LINUX_KERNEL_STATX_H
+ #define LLVM_LIBC_SRC_SYS_STAT_LINUX_KERNEL_STATX_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+ #include <sys/stat.h>
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+diff --git a/libc/src/time/CMakeLists.txt b/libc/src/time/CMakeLists.txt
+index 3b951df81001..304b3f247a50 100644
+--- a/libc/src/time/CMakeLists.txt
++++ b/libc/src/time/CMakeLists.txt
+@@ -7,10 +7,11 @@ add_header_library(
+   HDRS
+     time_constants.h
+   DEPENDS
++    libc.hdr.stdint_proxy
++    libc.hdr.types.time_t
+     libc.include.time
+     libc.src.__support.CPP.array
+     libc.src.__support.CPP.string_view
+-    libc.hdr.types.time_t
+ )
+ 
+ add_object_library(
+@@ -29,6 +30,7 @@ add_object_library(
+     libc.hdr.types.time_t
+     libc.hdr.types.size_t
+     libc.hdr.types.struct_tm
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_entrypoint_object(
+diff --git a/libc/src/time/linux/CMakeLists.txt b/libc/src/time/linux/CMakeLists.txt
+index 314623f9f425..a6ec7c7c0696 100644
+--- a/libc/src/time/linux/CMakeLists.txt
++++ b/libc/src/time/linux/CMakeLists.txt
+@@ -34,6 +34,7 @@ add_entrypoint_object(
+     ../nanosleep.h
+   DEPENDS
+     libc.hdr.types.struct_timespec
++    libc.hdr.stdint_proxy
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+     libc.src.__support.CPP.limits
+diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp
+index 6b9704126a0a..e5df1585df98 100644
+--- a/libc/src/time/linux/nanosleep.cpp
++++ b/libc/src/time/linux/nanosleep.cpp
+@@ -7,13 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/time/nanosleep.h"
++#include "hdr/stdint_proxy.h" // For int64_t.
+ #include "hdr/time_macros.h"
+ #include "src/__support/OSUtil/syscall.h" // For syscall functions.
+ #include "src/__support/common.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>      // For int64_t.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/time/strftime_core/CMakeLists.txt b/libc/src/time/strftime_core/CMakeLists.txt
+index 5e40e662ac79..3ffd283ead7f 100644
+--- a/libc/src/time/strftime_core/CMakeLists.txt
++++ b/libc/src/time/strftime_core/CMakeLists.txt
+@@ -5,6 +5,7 @@ add_header_library(
+   DEPENDS
+     libc.src.__support.CPP.string_view
+     libc.hdr.types.struct_tm
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_header_library(
+diff --git a/libc/src/time/strftime_core/core_structs.h b/libc/src/time/strftime_core/core_structs.h
+index 25bf5e616e0e..9da57aa11e6c 100644
+--- a/libc/src/time/strftime_core/core_structs.h
++++ b/libc/src/time/strftime_core/core_structs.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_CORE_STRUCTS_H
+ #define LLVM_LIBC_SRC_STDIO_STRFTIME_CORE_CORE_STRUCTS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/struct_tm.h"
+ #include "src/__support/CPP/string_view.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace strftime_core {
+ 
+diff --git a/libc/src/time/time_constants.h b/libc/src/time/time_constants.h
+index 0fcb7ffbf13b..32eb0a171f8d 100644
+--- a/libc/src/time/time_constants.h
++++ b/libc/src/time/time_constants.h
+@@ -9,10 +9,10 @@
+ #ifndef LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+ #define LLVM_LIBC_SRC_TIME_TIME_CONSTANTS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/time_t.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/string_view.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace time_constants {
+diff --git a/libc/src/time/time_utils.cpp b/libc/src/time/time_utils.cpp
+index 1c519c3ff8ae..1d0daea6b321 100644
+--- a/libc/src/time/time_utils.cpp
++++ b/libc/src/time/time_utils.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "src/time/time_utils.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h" // INT_MIN, INT_MAX
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ #include "src/time/time_constants.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace time_utils {
+ 
+diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h
+index 0541c24ece82..84d412c1e846 100644
+--- a/libc/src/time/time_utils.h
++++ b/libc/src/time/time_utils.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_SRC_TIME_TIME_UTILS_H
+ #define LLVM_LIBC_SRC_TIME_TIME_UTILS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "hdr/types/size_t.h"
+ #include "hdr/types/struct_tm.h"
+ #include "hdr/types/time_t.h"
+@@ -19,8 +20,6 @@
+ #include "src/__support/macros/config.h"
+ #include "time_constants.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace time_utils {
+ 
+diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt
+index 368593a3bb7b..382a61fea8b6 100644
+--- a/libc/src/unistd/linux/CMakeLists.txt
++++ b/libc/src/unistd/linux/CMakeLists.txt
+@@ -172,6 +172,7 @@ add_entrypoint_object(
+   DEPENDS
+     libc.hdr.types.off_t
+     libc.hdr.fcntl_macros
++    libc.hdr.stdint_proxy
+     libc.include.unistd
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+@@ -385,6 +386,7 @@ add_entrypoint_object(
+     libc.hdr.types.size_t
+     libc.hdr.types.ssize_t
+     libc.hdr.fcntl_macros
++    libc.hdr.stdint_proxy
+     libc.include.unistd
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+@@ -403,6 +405,7 @@ add_entrypoint_object(
+     libc.hdr.types.size_t
+     libc.hdr.types.ssize_t
+     libc.hdr.fcntl_macros
++    libc.hdr.stdint_proxy
+     libc.include.unistd
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+@@ -546,6 +549,7 @@ add_entrypoint_object(
+   DEPENDS
+     libc.hdr.types.off_t
+     libc.hdr.fcntl_macros
++    libc.hdr.stdint_proxy
+     libc.include.unistd
+     libc.include.sys_syscall
+     libc.src.__support.OSUtil.osutil
+diff --git a/libc/src/unistd/linux/ftruncate.cpp b/libc/src/unistd/linux/ftruncate.cpp
+index f6aa6f8b48cc..b4729f8d50d7 100644
+--- a/libc/src/unistd/linux/ftruncate.cpp
++++ b/libc/src/unistd/linux/ftruncate.cpp
+@@ -11,10 +11,10 @@
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+ 
++#include "hdr/stdint_proxy.h" // For uint64_t.
+ #include "hdr/unistd_macros.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>      // For uint64_t.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/unistd/linux/pread.cpp b/libc/src/unistd/linux/pread.cpp
+index 2f86e397feef..cf3152dbbad8 100644
+--- a/libc/src/unistd/linux/pread.cpp
++++ b/libc/src/unistd/linux/pread.cpp
+@@ -8,12 +8,12 @@
+ 
+ #include "src/unistd/pread.h"
+ 
++#include "hdr/stdint_proxy.h"             // For uint64_t.
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON
+-#include <stdint.h>                         // For uint64_t.
+ #include <sys/syscall.h>                    // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/unistd/linux/pwrite.cpp b/libc/src/unistd/linux/pwrite.cpp
+index f4cf8e16d766..7672063dcab9 100644
+--- a/libc/src/unistd/linux/pwrite.cpp
++++ b/libc/src/unistd/linux/pwrite.cpp
+@@ -11,9 +11,9 @@
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/common.h"
+ 
++#include "hdr/stdint_proxy.h" // For uint64_t.
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>      // For uint64_t.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/src/unistd/linux/truncate.cpp b/libc/src/unistd/linux/truncate.cpp
+index 6103d4b51350..204b27f7308d 100644
+--- a/libc/src/unistd/linux/truncate.cpp
++++ b/libc/src/unistd/linux/truncate.cpp
+@@ -13,8 +13,8 @@
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ 
++#include "hdr/stdint_proxy.h" // For uint64_t.
+ #include "hdr/unistd_macros.h"
+-#include <stdint.h>      // For uint64_t.
+ #include <sys/syscall.h> // For syscall numbers.
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/startup/baremetal/CMakeLists.txt b/libc/startup/baremetal/CMakeLists.txt
+index 4faced93fabe..276fe3387682 100644
+--- a/libc/startup/baremetal/CMakeLists.txt
++++ b/libc/startup/baremetal/CMakeLists.txt
+@@ -2,10 +2,16 @@ add_entrypoint_object(
+   init
+   SRCS
+     init.cpp
++  DEPENDS
++    libc.hdr.stdint_proxy
++    libc.src.__support.common
+ )
+ 
+ add_entrypoint_object(
+   fini
+   SRCS
+     fini.cpp
++  DEPENDS
++    libc.hdr.stdint_proxy
++    libc.src.__support.common
+ )
+diff --git a/libc/startup/baremetal/fini.cpp b/libc/startup/baremetal/fini.cpp
+index 263d7192cc60..64af842572ec 100644
+--- a/libc/startup/baremetal/fini.cpp
++++ b/libc/startup/baremetal/fini.cpp
+@@ -6,9 +6,9 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/startup/baremetal/init.cpp b/libc/startup/baremetal/init.cpp
+index ce387017c497..995609c1bf41 100644
+--- a/libc/startup/baremetal/init.cpp
++++ b/libc/startup/baremetal/init.cpp
+@@ -6,9 +6,9 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ 
+diff --git a/libc/startup/linux/CMakeLists.txt b/libc/startup/linux/CMakeLists.txt
+index eaa724e41f16..7af1819c57c1 100644
+--- a/libc/startup/linux/CMakeLists.txt
++++ b/libc/startup/linux/CMakeLists.txt
+@@ -96,6 +96,7 @@ add_object_library(
+     do_start.h
+   DEPENDS
+     libc.config.app_h
++    libc.hdr.stdint_proxy
+     libc.include.sys_mman
+     libc.include.sys_syscall
+     libc.include.llvm-libc-macros.link_macros
+diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp
+index ff104c7f0d1d..94c4ec709286 100644
+--- a/libc/startup/linux/do_start.cpp
++++ b/libc/startup/linux/do_start.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ #include "startup/linux/do_start.h"
+ #include "config/linux/app.h"
++#include "hdr/stdint_proxy.h"
+ #include "include/llvm-libc-macros/link-macros.h"
+ #include "src/__support/OSUtil/syscall.h"
+ #include "src/__support/macros/config.h"
+@@ -17,7 +18,6 @@
+ 
+ #include <linux/auxvec.h>
+ #include <linux/elf.h>
+-#include <stdint.h>
+ #include <sys/mman.h>
+ #include <sys/syscall.h>
+ 
+diff --git a/libc/test/IntegrationTest/CMakeLists.txt b/libc/test/IntegrationTest/CMakeLists.txt
+index 4a999407d48d..3afe354eca98 100644
+--- a/libc/test/IntegrationTest/CMakeLists.txt
++++ b/libc/test/IntegrationTest/CMakeLists.txt
+@@ -11,6 +11,7 @@ add_object_library(
+   HDRS
+     test.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.OSUtil.osutil
+     ${arch_specific_deps}
+ )
+diff --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp
+index 871bdf0dc562..8baf74637b30 100644
+--- a/libc/test/IntegrationTest/test.cpp
++++ b/libc/test/IntegrationTest/test.cpp
+@@ -6,10 +6,10 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ #ifdef LIBC_TARGET_ARCH_IS_AARCH64
+ #include "src/sys/auxv/getauxval.h"
+diff --git a/libc/test/UnitTest/CMakeLists.txt b/libc/test/UnitTest/CMakeLists.txt
+index c32809da577d..d92ab6feccb2 100644
+--- a/libc/test/UnitTest/CMakeLists.txt
++++ b/libc/test/UnitTest/CMakeLists.txt
+@@ -69,6 +69,7 @@ add_unittest_framework_library(
+     Test.h
+     TestLogger.h
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.big_int
+     libc.src.__support.c_string
+     libc.src.__support.CPP.string
+@@ -91,12 +92,16 @@ add_unittest_framework_library(
+     ${libc_death_test_srcs}
+   HDRS
+     ExecuteFunction.h
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_unittest_framework_library(
+   LibcHermeticTestSupport
+   SRCS
+     HermeticTestUtils.cpp
++  DEPENDS
++    libc.hdr.stdint_proxy
+ )
+ 
+ add_header_library(
+@@ -160,6 +165,7 @@ add_unittest_framework_library(
+     PrintfMatcher.h
+   DEPENDS
+     LibcTest
++    libc.hdr.stdint_proxy
+     libc.src.__support.FPUtil.fp_bits
+     libc.src.stdio.printf_core.core_structs
+     libc.test.UnitTest.string_utils
+@@ -173,6 +179,7 @@ add_unittest_framework_library(
+     ScanfMatcher.h
+   DEPENDS
+     LibcTest
++    libc.hdr.stdint_proxy
+     libc.src.__support.FPUtil.fp_bits
+     libc.src.stdio.scanf_core.core_structs
+     libc.test.UnitTest.string_utils
+diff --git a/libc/test/UnitTest/ExecuteFunction.h b/libc/test/UnitTest/ExecuteFunction.h
+index 93ab6e973380..5844fd55d4e2 100644
+--- a/libc/test/UnitTest/ExecuteFunction.h
++++ b/libc/test/UnitTest/ExecuteFunction.h
+@@ -9,9 +9,9 @@
+ #ifndef LLVM_LIBC_TEST_UNITTEST_EXECUTEFUNCTION_H
+ #define LLVM_LIBC_TEST_UNITTEST_EXECUTEFUNCTION_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testutils {
+diff --git a/libc/test/UnitTest/HermeticTestUtils.cpp b/libc/test/UnitTest/HermeticTestUtils.cpp
+index a9494af746d5..f34a73f3e680 100644
+--- a/libc/test/UnitTest/HermeticTestUtils.cpp
++++ b/libc/test/UnitTest/HermeticTestUtils.cpp
+@@ -6,10 +6,10 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/common.h"
+ #include "src/__support/macros/config.h"
+ #include <stddef.h>
+-#include <stdint.h>
+ 
+ #ifdef LIBC_TARGET_ARCH_IS_AARCH64
+ #include "src/sys/auxv/getauxval.h"
+diff --git a/libc/test/UnitTest/PrintfMatcher.cpp b/libc/test/UnitTest/PrintfMatcher.cpp
+index 4fdcbf1746d2..2ea1bbfcc636 100644
+--- a/libc/test/UnitTest/PrintfMatcher.cpp
++++ b/libc/test/UnitTest/PrintfMatcher.cpp
+@@ -8,6 +8,7 @@
+ 
+ #include "PrintfMatcher.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdio/printf_core/core_structs.h"
+@@ -15,8 +16,6 @@
+ #include "test/UnitTest/StringUtils.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ 
+diff --git a/libc/test/UnitTest/RoundingModeUtils.h b/libc/test/UnitTest/RoundingModeUtils.h
+index cdc3699d1565..5f23a8708fe6 100644
+--- a/libc/test/UnitTest/RoundingModeUtils.h
++++ b/libc/test/UnitTest/RoundingModeUtils.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_TEST_UNITTEST_ROUNDINGMODEUTILS_H
+ #define LLVM_LIBC_TEST_UNITTEST_ROUNDINGMODEUTILS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace fputil {
+diff --git a/libc/test/UnitTest/ScanfMatcher.cpp b/libc/test/UnitTest/ScanfMatcher.cpp
+index 3e9f2a5f941a..d47ad71a3b6f 100644
+--- a/libc/test/UnitTest/ScanfMatcher.cpp
++++ b/libc/test/UnitTest/ScanfMatcher.cpp
+@@ -8,6 +8,7 @@
+ 
+ #include "ScanfMatcher.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdio/scanf_core/core_structs.h"
+@@ -15,8 +16,6 @@
+ #include "test/UnitTest/StringUtils.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ 
+diff --git a/libc/test/UnitTest/TestLogger.cpp b/libc/test/UnitTest/TestLogger.cpp
+index e1df7987bcd8..3d95d48a5f5a 100644
+--- a/libc/test/UnitTest/TestLogger.cpp
++++ b/libc/test/UnitTest/TestLogger.cpp
+@@ -1,14 +1,13 @@
+ #include "test/UnitTest/TestLogger.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/string.h"
+ #include "src/__support/CPP/string_view.h"
+-#include "src/__support/OSUtil/io.h"               // write_to_stderr
+-#include "src/__support/big_int.h"                 // is_big_int
++#include "src/__support/OSUtil/io.h" // write_to_stderr
++#include "src/__support/big_int.h"   // is_big_int
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+ #include "src/__support/uint128.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ 
+diff --git a/libc/test/integration/src/pthread/CMakeLists.txt b/libc/test/integration/src/pthread/CMakeLists.txt
+index 208ba3fd4350..0bdd99c253fe 100644
+--- a/libc/test/integration/src/pthread/CMakeLists.txt
++++ b/libc/test/integration/src/pthread/CMakeLists.txt
+@@ -7,6 +7,7 @@ add_integration_test(
+   SRCS
+     pthread_mutex_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.errno.errno
+     libc.src.pthread.pthread_mutex_destroy
+@@ -84,6 +85,7 @@ add_integration_test(
+   SRCS
+     pthread_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.pthread.pthread_create
+     libc.src.pthread.pthread_join
+@@ -96,6 +98,7 @@ add_integration_test(
+   SRCS
+     pthread_equal_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.errno.errno
+     libc.src.pthread.pthread_mutex_destroy
+@@ -115,6 +118,7 @@ add_integration_test(
+   SRCS
+     pthread_name_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.errno.errno
+     libc.src.pthread.pthread_create
+@@ -165,6 +169,7 @@ add_integration_test(
+   SRCS
+     pthread_once_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.pthread
+     libc.src.pthread.pthread_once
+     libc.src.pthread.pthread_mutex_destroy
+diff --git a/libc/test/integration/src/pthread/pthread_equal_test.cpp b/libc/test/integration/src/pthread/pthread_equal_test.cpp
+index 82f5a494e1bd..01569798537e 100644
+--- a/libc/test/integration/src/pthread/pthread_equal_test.cpp
++++ b/libc/test/integration/src/pthread/pthread_equal_test.cpp
+@@ -6,6 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h" // uintptr_t
+ #include "src/pthread/pthread_create.h"
+ #include "src/pthread/pthread_equal.h"
+ #include "src/pthread/pthread_join.h"
+@@ -14,11 +15,9 @@
+ #include "src/pthread/pthread_mutex_lock.h"
+ #include "src/pthread/pthread_mutex_unlock.h"
+ #include "src/pthread/pthread_self.h"
+-
+ #include "test/IntegrationTest/test.h"
+ 
+ #include <pthread.h>
+-#include <stdint.h> // uintptr_t
+ 
+ pthread_t child_thread;
+ pthread_mutex_t mutex;
+diff --git a/libc/test/integration/src/pthread/pthread_mutex_test.cpp b/libc/test/integration/src/pthread/pthread_mutex_test.cpp
+index 137daed6bd28..d41ad6daf426 100644
+--- a/libc/test/integration/src/pthread/pthread_mutex_test.cpp
++++ b/libc/test/integration/src/pthread/pthread_mutex_test.cpp
+@@ -6,18 +6,16 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h" // uintptr_t
++#include "src/pthread/pthread_create.h"
++#include "src/pthread/pthread_join.h"
+ #include "src/pthread/pthread_mutex_destroy.h"
+ #include "src/pthread/pthread_mutex_init.h"
+ #include "src/pthread/pthread_mutex_lock.h"
+ #include "src/pthread/pthread_mutex_unlock.h"
+-
+-#include "src/pthread/pthread_create.h"
+-#include "src/pthread/pthread_join.h"
+-
+ #include "test/IntegrationTest/test.h"
+ 
+ #include <pthread.h>
+-#include <stdint.h> // uintptr_t
+ 
+ constexpr int START = 0;
+ constexpr int MAX = 10000;
+diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp
+index 35dd3b165e0e..343a22356593 100644
+--- a/libc/test/integration/src/pthread/pthread_name_test.cpp
++++ b/libc/test/integration/src/pthread/pthread_name_test.cpp
+@@ -6,6 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h" // uintptr_t
+ #include "src/__support/CPP/string_view.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/pthread/pthread_create.h"
+@@ -17,11 +18,9 @@
+ #include "src/pthread/pthread_mutex_unlock.h"
+ #include "src/pthread/pthread_self.h"
+ #include "src/pthread/pthread_setname_np.h"
+-
+ #include "test/IntegrationTest/test.h"
+ 
+ #include <pthread.h>
+-#include <stdint.h> // uintptr_t
+ 
+ using string_view = LIBC_NAMESPACE::cpp::string_view;
+ 
+diff --git a/libc/test/integration/src/pthread/pthread_once_test.cpp b/libc/test/integration/src/pthread/pthread_once_test.cpp
+index 8e0f234ee8c6..68aadf3bdc18 100644
+--- a/libc/test/integration/src/pthread/pthread_once_test.cpp
++++ b/libc/test/integration/src/pthread/pthread_once_test.cpp
+@@ -6,6 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h" // uintptr_t
+ #include "src/__support/CPP/atomic.h"
+ #include "src/pthread/pthread_create.h"
+ #include "src/pthread/pthread_join.h"
+@@ -14,11 +15,9 @@
+ #include "src/pthread/pthread_mutex_lock.h"
+ #include "src/pthread/pthread_mutex_unlock.h"
+ #include "src/pthread/pthread_once.h"
+-
+ #include "test/IntegrationTest/test.h"
+ 
+ #include <pthread.h>
+-#include <stdint.h> // uintptr_t
+ 
+ static constexpr unsigned int NUM_THREADS = 5;
+ static LIBC_NAMESPACE::cpp::Atomic<unsigned int> thread_count;
+diff --git a/libc/test/integration/src/pthread/pthread_test.cpp b/libc/test/integration/src/pthread/pthread_test.cpp
+index 3565a7cb0868..4d635fd9a96d 100644
+--- a/libc/test/integration/src/pthread/pthread_test.cpp
++++ b/libc/test/integration/src/pthread/pthread_test.cpp
+@@ -6,12 +6,12 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h" // uintptr_t
+ #include "src/pthread/pthread_create.h"
+ #include "src/pthread/pthread_join.h"
+ #include "test/IntegrationTest/test.h"
+ 
+ #include <pthread.h>
+-#include <stdint.h> // uintptr_t
+ 
+ static constexpr int thread_count = 1000;
+ static int counter = 0;
+diff --git a/libc/test/integration/src/spawn/CMakeLists.txt b/libc/test/integration/src/spawn/CMakeLists.txt
+index caed59fc1b7a..f3a7327cc005 100644
+--- a/libc/test/integration/src/spawn/CMakeLists.txt
++++ b/libc/test/integration/src/spawn/CMakeLists.txt
+@@ -29,6 +29,7 @@ add_integration_test(
+   DEPENDS
+     libc_posix_spawn_test_binary
+     libc.test.integration.src.spawn.test_binary_properties
++    libc.hdr.stdint_proxy
+     libc.include.fcntl
+     libc.include.signal
+     libc.include.spawn
+diff --git a/libc/test/integration/src/spawn/posix_spawn_test.cpp b/libc/test/integration/src/spawn/posix_spawn_test.cpp
+index 104096c40c3a..3c8cc7317b53 100644
+--- a/libc/test/integration/src/spawn/posix_spawn_test.cpp
++++ b/libc/test/integration/src/spawn/posix_spawn_test.cpp
+@@ -8,6 +8,7 @@
+ 
+ #include "test_binary_properties.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/spawn/posix_spawn.h"
+ #include "src/spawn/posix_spawn_file_actions_addopen.h"
+ #include "src/spawn/posix_spawn_file_actions_destroy.h"
+@@ -18,7 +19,6 @@
+ #include <fcntl.h>
+ #include <spawn.h>
+ #include <stddef.h>
+-#include <stdint.h>
+ #include <sys/wait.h>
+ 
+ char arg0[] = "libc_posix_spawn_test_binary";
+diff --git a/libc/test/src/CMakeLists.txt b/libc/test/src/CMakeLists.txt
+index 6dca47b5343e..b63092102fe5 100644
+--- a/libc/test/src/CMakeLists.txt
++++ b/libc/test/src/CMakeLists.txt
+@@ -51,7 +51,9 @@ function(add_fp_unittest name)
+     ${test_type}
+     LINK_LIBRARIES "${MATH_UNITTEST_LINK_LIBRARIES}"
+     "${MATH_UNITTEST_UNPARSED_ARGUMENTS}"
+-    DEPENDS "${deps}"
++    DEPENDS
++      libc.hdr.stdint_proxy
++      "${deps}"
+   )
+ endfunction(add_fp_unittest)
+ 
+diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
+index e54d7a5c9638..5d1d0e0e5316 100644
+--- a/libc/test/src/__support/CMakeLists.txt
++++ b/libc/test/src/__support/CMakeLists.txt
+@@ -265,6 +265,7 @@ add_libc_test(
+   SRCS
+     str_to_float_comparison_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.stdio.printf
+     libc.src.stdio.fopen
+     libc.src.stdio.fclose
+diff --git a/libc/test/src/__support/CPP/CMakeLists.txt b/libc/test/src/__support/CPP/CMakeLists.txt
+index 2b4d6107b767..3e1379d812c3 100644
+--- a/libc/test/src/__support/CPP/CMakeLists.txt
++++ b/libc/test/src/__support/CPP/CMakeLists.txt
+@@ -27,6 +27,7 @@ add_libc_test(
+   SRCS
+     bit_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.big_int
+     libc.src.__support.CPP.bit
+     libc.src.__support.macros.properties.types
+diff --git a/libc/test/src/__support/CPP/bit_test.cpp b/libc/test/src/__support/CPP/bit_test.cpp
+index 89e2a75e0680..891e693e0c95 100644
+--- a/libc/test/src/__support/CPP/bit_test.cpp
++++ b/libc/test/src/__support/CPP/bit_test.cpp
+@@ -6,14 +6,13 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/big_int.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace cpp {
+ 
+diff --git a/libc/test/src/__support/HashTable/CMakeLists.txt b/libc/test/src/__support/HashTable/CMakeLists.txt
+index f84835fe95c7..55e9864c792f 100644
+--- a/libc/test/src/__support/HashTable/CMakeLists.txt
++++ b/libc/test/src/__support/HashTable/CMakeLists.txt
+@@ -30,6 +30,7 @@ add_libc_test(
+   SRCS
+     group_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.HashTable.bitmask
+     libc.src.stdlib.rand
+     libc.src.search.hsearch
+diff --git a/libc/test/src/__support/HashTable/group_test.cpp b/libc/test/src/__support/HashTable/group_test.cpp
+index acdc58e20585..890ca001b0d3 100644
+--- a/libc/test/src/__support/HashTable/group_test.cpp
++++ b/libc/test/src/__support/HashTable/group_test.cpp
+@@ -8,11 +8,11 @@
+ 
+ #include "src/__support/HashTable/bitmask.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/__support/macros/config.h"
+ #include "src/stdlib/rand.h"
+ #include "test/UnitTest/Test.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace internal {
+diff --git a/libc/test/src/__support/str_to_float_comparison_test.cpp b/libc/test/src/__support/str_to_float_comparison_test.cpp
+index 6e89ce2aabf3..18be22af00d8 100644
+--- a/libc/test/src/__support/str_to_float_comparison_test.cpp
++++ b/libc/test/src/__support/str_to_float_comparison_test.cpp
+@@ -6,6 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/bit.h"
+ #include "src/stdio/fclose.h"
+ #include "src/stdio/fgets.h"
+@@ -17,7 +18,6 @@
+ #include "src/string/strdup.h"
+ #include "src/string/strtok.h"
+ #include "test/UnitTest/Test.h"
+-#include <stdint.h>
+ 
+ // The intent of this test is to read in files in the format used in this test
+ // dataset: https://github.com/nigeltao/parse-number-fxx-test-data
+diff --git a/libc/test/src/fenv/feclearexcept_test.cpp b/libc/test/src/fenv/feclearexcept_test.cpp
+index 52adda46adf2..f39cf4eca05c 100644
+--- a/libc/test/src/fenv/feclearexcept_test.cpp
++++ b/libc/test/src/fenv/feclearexcept_test.cpp
+@@ -13,7 +13,6 @@
+ #include "test/UnitTest/Test.h"
+ 
+ #include "hdr/fenv_macros.h"
+-#include <stdint.h>
+ 
+ #include "excepts.h"
+ 
+diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h
+index 08dce0d4d478..cff2aebcba48 100644
+--- a/libc/test/src/math/LdExpTest.h
++++ b/libc/test/src/math/LdExpTest.h
+@@ -17,7 +17,7 @@
+ #include "test/UnitTest/Test.h"
+ 
+ #include "hdr/math_macros.h"
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LIBC_NAMESPACE::Sign;
+ 
+diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp
+index aa0128fee999..6e2bed75a7ff 100644
+--- a/libc/test/src/math/acosf_test.cpp
++++ b/libc/test/src/math/acosf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/acosf.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+ 
+ using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+diff --git a/libc/test/src/math/acoshf16_test.cpp b/libc/test/src/math/acoshf16_test.cpp
+index 2eb95215e4e8..d190cd1a5dcb 100644
+--- a/libc/test/src/math/acoshf16_test.cpp
++++ b/libc/test/src/math/acoshf16_test.cpp
+@@ -6,12 +6,12 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/acoshf16.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+-#include <stdint.h>
+ 
+ using LlvmLibcAcoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp
+index 3d3b827411a4..3305445fb733 100644
+--- a/libc/test/src/math/acoshf_test.cpp
++++ b/libc/test/src/math/acoshf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/acoshf.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp
+index 1eaa6b8a5135..f2963f25f40f 100644
+--- a/libc/test/src/math/asinf_test.cpp
++++ b/libc/test/src/math/asinf_test.cpp
+@@ -8,6 +8,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/asinf.h"
+@@ -15,8 +16,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp
+index 8c78f939cabf..f9dfb0a2d521 100644
+--- a/libc/test/src/math/asinhf_test.cpp
++++ b/libc/test/src/math/asinhf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/asinhf.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
+index a4bdf1867c39..dc489fef9356 100644
+--- a/libc/test/src/math/atanf_test.cpp
++++ b/libc/test/src/math/atanf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/atanf.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/atanhf16_test.cpp b/libc/test/src/math/atanhf16_test.cpp
+index e35cc775b060..4c3530480632 100644
+--- a/libc/test/src/math/atanhf16_test.cpp
++++ b/libc/test/src/math/atanhf16_test.cpp
+@@ -6,11 +6,11 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/math/atanhf16.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+-#include <stdint.h>
+ 
+ using LlvmLibcAtanhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp
+index 32272ef482ab..1ec7b6b9c525 100644
+--- a/libc/test/src/math/atanhf_test.cpp
++++ b/libc/test/src/math/atanhf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/atanhf.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ using LIBC_NAMESPACE::Sign;
+ 
+diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp
+index 90dc8ff6a0ea..e2b444f87084 100644
+--- a/libc/test/src/math/cosf_test.cpp
++++ b/libc/test/src/math/cosf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/cosf.h"
+@@ -15,8 +16,6 @@
+ #include "test/src/math/sdcomp26094.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES;
+ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp
+index bdaba50f1f14..98f9a82f4198 100644
+--- a/libc/test/src/math/coshf_test.cpp
++++ b/libc/test/src/math/coshf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+@@ -15,8 +16,6 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp
+index f9b03377daa9..c40aacf852bc 100644
+--- a/libc/test/src/math/erff_test.cpp
++++ b/libc/test/src/math/erff_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcErffTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp
+index 6126e5f211ff..ee8fe9b5fb28 100644
+--- a/libc/test/src/math/exp10_test.cpp
++++ b/libc/test/src/math/exp10_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp10Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp
+index 89915961c9b9..e20214239112 100644
+--- a/libc/test/src/math/exp10f_test.cpp
++++ b/libc/test/src/math/exp10f_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp
+index 01802bd68f7e..613fdebab596 100644
+--- a/libc/test/src/math/exp10m1f_test.cpp
++++ b/libc/test/src/math/exp10m1f_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp
+index 4cd95dd5486e..7866037c9c54 100644
+--- a/libc/test/src/math/exp2_test.cpp
++++ b/libc/test/src/math/exp2_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp
+index aeecb3e74b07..349a4c370418 100644
+--- a/libc/test/src/math/exp2f_test.cpp
++++ b/libc/test/src/math/exp2f_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp
+index 0c87657abc08..a323bdcb94cf 100644
+--- a/libc/test/src/math/exp2m1f_test.cpp
++++ b/libc/test/src/math/exp2m1f_test.cpp
+@@ -15,7 +15,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExp2m1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp
+index 83addaeb943d..d328d09cd869 100644
+--- a/libc/test/src/math/exp_test.cpp
++++ b/libc/test/src/math/exp_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExpTest = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp
+index 3c10812ff5bc..8329d7475741 100644
+--- a/libc/test/src/math/expf_test.cpp
++++ b/libc/test/src/math/expf_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp
+index 0cf07e2e4973..5d546ded13df 100644
+--- a/libc/test/src/math/expm1_test.cpp
++++ b/libc/test/src/math/expm1_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExpm1Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp
+index cf3fe9c26ae1..d5c98be142fb 100644
+--- a/libc/test/src/math/expm1f_test.cpp
++++ b/libc/test/src/math/expm1f_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/in_float_range_test_helper.h b/libc/test/src/math/in_float_range_test_helper.h
+index 35e039e74af5..21791822013f 100644
+--- a/libc/test/src/math/in_float_range_test_helper.h
++++ b/libc/test/src/math/in_float_range_test_helper.h
+@@ -5,7 +5,7 @@
+ #ifndef LLVM_LIBC_TEST_SRC_MATH_IN_FLOAT_RANGE_TEST_HELPER_H
+ #define LLVM_LIBC_TEST_SRC_MATH_IN_FLOAT_RANGE_TEST_HELPER_H
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ #define CHECK_DATA(start, stop, mfp_op, f, f_check, count, prec)               \
+   {                                                                            \
+diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp
+index e9529d87c388..7d087d4eb9ed 100644
+--- a/libc/test/src/math/log10_test.cpp
++++ b/libc/test/src/math/log10_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog10Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/log10f_test.cpp b/libc/test/src/math/log10f_test.cpp
+index b2c1c283e08e..c554948f7e64 100644
+--- a/libc/test/src/math/log10f_test.cpp
++++ b/libc/test/src/math/log10f_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp
+index e5747b7e5ec0..4d1efe701430 100644
+--- a/libc/test/src/math/log1p_test.cpp
++++ b/libc/test/src/math/log1p_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog1pTest = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp
+index ffe2dd2c33dd..0badbeb5ae28 100644
+--- a/libc/test/src/math/log1pf_test.cpp
++++ b/libc/test/src/math/log1pf_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog1pfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp
+index fc440c09b42b..dd55bd8b9764 100644
+--- a/libc/test/src/math/log2_test.cpp
++++ b/libc/test/src/math/log2_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp
+index 92226c763f45..78544893c520 100644
+--- a/libc/test/src/math/log2f_test.cpp
++++ b/libc/test/src/math/log2f_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLog2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp
+index 54afaa33d135..be156fe5987c 100644
+--- a/libc/test/src/math/log_test.cpp
++++ b/libc/test/src/math/log_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLogTest = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp
+index 79d8275856b6..7658075c1738 100644
+--- a/libc/test/src/math/logf_test.cpp
++++ b/libc/test/src/math/logf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcLogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/performance_testing/Timer.h b/libc/test/src/math/performance_testing/Timer.h
+index 32578ade4888..f75c1f0285e4 100644
+--- a/libc/test/src/math/performance_testing/Timer.h
++++ b/libc/test/src/math/performance_testing/Timer.h
+@@ -9,8 +9,8 @@
+ #ifndef LLVM_LIBC_TEST_SRC_MATH_PERFORMACE_TESTING_TIMER_H
+ #define LLVM_LIBC_TEST_SRC_MATH_PERFORMACE_TESTING_TIMER_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/macros/config.h"
+-#include <stdint.h>
+ 
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+diff --git a/libc/test/src/math/performance_testing/fmodf16_perf.cpp b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
+index f7c492cb7779..407ed7cb6009 100644
+--- a/libc/test/src/math/performance_testing/fmodf16_perf.cpp
++++ b/libc/test/src/math/performance_testing/fmodf16_perf.cpp
+@@ -11,7 +11,7 @@
+ #include "src/__support/FPUtil/generic/FMod.h"
+ #include "src/__support/macros/properties/types.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ #define FMOD_FUNC(U) (LIBC_NAMESPACE::fputil::generic::FMod<float16, U>::eval)
+ 
+diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp
+index 4d189d813e58..1d7072463e0b 100644
+--- a/libc/test/src/math/powf_test.cpp
++++ b/libc/test/src/math/powf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcPowfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ using LIBC_NAMESPACE::testing::tlog;
+diff --git a/libc/test/src/math/sdcomp26094.h b/libc/test/src/math/sdcomp26094.h
+index bb2b9f1efb90..b08cfd2936c6 100644
+--- a/libc/test/src/math/sdcomp26094.h
++++ b/libc/test/src/math/sdcomp26094.h
+@@ -9,11 +9,10 @@
+ #ifndef LLVM_LIBC_TEST_SRC_MATH_SDCOMP26094_H
+ #define LLVM_LIBC_TEST_SRC_MATH_SDCOMP26094_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/macros/config.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ 
+diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp
+index ad2155f329cd..c4dde4b5365f 100644
+--- a/libc/test/src/math/sincosf_test.cpp
++++ b/libc/test/src/math/sincosf_test.cpp
+@@ -15,7 +15,7 @@
+ #include "test/src/math/sdcomp26094.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp
+index e0357e6157fd..c4676d9bb539 100644
+--- a/libc/test/src/math/sinf_test.cpp
++++ b/libc/test/src/math/sinf_test.cpp
+@@ -15,7 +15,7 @@
+ #include "test/src/math/sdcomp26094.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp
+index 74f906ebaa98..a66401056bd4 100644
+--- a/libc/test/src/math/sinhf_test.cpp
++++ b/libc/test/src/math/sinhf_test.cpp
+@@ -15,7 +15,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp
+index 986c676761f0..18bc40118182 100644
+--- a/libc/test/src/math/sinpif_test.cpp
++++ b/libc/test/src/math/sinpif_test.cpp
+@@ -12,7 +12,7 @@
+ #include "test/src/math/sdcomp26094.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h
+index 094cc7e4e9e7..8de70ad161ad 100644
+--- a/libc/test/src/math/smoke/LdExpTest.h
++++ b/libc/test/src/math/smoke/LdExpTest.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_TEST_SRC_MATH_LDEXPTEST_H
+ #define LLVM_LIBC_TEST_SRC_MATH_LDEXPTEST_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/limits.h" // INT_MAX
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/FPUtil/NormalFloat.h"
+@@ -16,8 +17,6 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LIBC_NAMESPACE::Sign;
+ 
+ template <typename T, typename U = int>
+diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp
+index 257c6a3d1d22..733610a03937 100644
+--- a/libc/test/src/math/smoke/acosf_test.cpp
++++ b/libc/test/src/math/smoke/acosf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp
+index b6abfab99929..19556b27e542 100644
+--- a/libc/test/src/math/smoke/acoshf_test.cpp
++++ b/libc/test/src/math/smoke/acoshf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp
+index 2615a8ddd16b..6195a1126694 100644
+--- a/libc/test/src/math/smoke/asinf_test.cpp
++++ b/libc/test/src/math/smoke/asinf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp
+index d812a2dffe8a..e6326c116023 100644
+--- a/libc/test/src/math/smoke/asinhf_test.cpp
++++ b/libc/test/src/math/smoke/asinhf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp
+index b56b9d0162b9..7d2dfee7edf3 100644
+--- a/libc/test/src/math/smoke/atanf_test.cpp
++++ b/libc/test/src/math/smoke/atanf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp
+index 038cb30d89a4..5588ae00dece 100644
+--- a/libc/test/src/math/smoke/atanhf_test.cpp
++++ b/libc/test/src/math/smoke/atanhf_test.cpp
+@@ -13,7 +13,7 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LIBC_NAMESPACE::Sign;
+ 
+diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp
+index 470a876c63a7..837fee9aadfa 100644
+--- a/libc/test/src/math/smoke/cosf_test.cpp
++++ b/libc/test/src/math/smoke/cosf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/cosf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcCosfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp
+index ee8f0199df3b..81096faaf815 100644
+--- a/libc/test/src/math/smoke/coshf_test.cpp
++++ b/libc/test/src/math/smoke/coshf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp
+index 3d48909cca93..6f1bb24b4ec3 100644
+--- a/libc/test/src/math/smoke/cospif_test.cpp
++++ b/libc/test/src/math/smoke/cospif_test.cpp
+@@ -6,12 +6,11 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/cospif.h"
+ #include "test/UnitTest/FPMatcher.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcCospifTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/erff_test.cpp b/libc/test/src/math/smoke/erff_test.cpp
+index a9f4994d77bb..133b4b823673 100644
+--- a/libc/test/src/math/smoke/erff_test.cpp
++++ b/libc/test/src/math/smoke/erff_test.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/math/erff.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcErffTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcErffTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp
+index 50d3de0c7fe7..76e6c7844c12 100644
+--- a/libc/test/src/math/smoke/exp10_test.cpp
++++ b/libc/test/src/math/smoke/exp10_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/exp10.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExp10Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcExp10Test, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp
+index fcd334bb9e36..cf2f976958e2 100644
+--- a/libc/test/src/math/smoke/exp10f_test.cpp
++++ b/libc/test/src/math/smoke/exp10f_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/exp10f.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp
+index aebf80835072..3d26df1ee710 100644
+--- a/libc/test/src/math/smoke/exp2_test.cpp
++++ b/libc/test/src/math/smoke/exp2_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/exp2.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExp2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcExp2Test, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp
+index c5243273d9ed..12bcbec78430 100644
+--- a/libc/test/src/math/smoke/exp2f_test.cpp
++++ b/libc/test/src/math/smoke/exp2f_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/exp2f.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp
+index c3b2ae70e1d9..4ce322736534 100644
+--- a/libc/test/src/math/smoke/exp_test.cpp
++++ b/libc/test/src/math/smoke/exp_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/exp.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExpTest = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcExpTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp
+index d34151735afa..a0e785f80a1d 100644
+--- a/libc/test/src/math/smoke/expf_test.cpp
++++ b/libc/test/src/math/smoke/expf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/expf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcExpfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp
+index c842fe3c45fe..db7149dc7ae5 100644
+--- a/libc/test/src/math/smoke/expm1_test.cpp
++++ b/libc/test/src/math/smoke/expm1_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/expm1.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExpm1Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcExpm1Test, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp
+index 214bfe8abd4d..9482bf6fd4b0 100644
+--- a/libc/test/src/math/smoke/expm1f_test.cpp
++++ b/libc/test/src/math/smoke/expm1f_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/expm1f.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp
+index 49cfda85111a..3af27d47437a 100644
+--- a/libc/test/src/math/smoke/log10_test.cpp
++++ b/libc/test/src/math/smoke/log10_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/log10.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLog10Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcLog10Test, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log10f_test.cpp b/libc/test/src/math/smoke/log10f_test.cpp
+index a63822140e9b..f15da755c17e 100644
+--- a/libc/test/src/math/smoke/log10f_test.cpp
++++ b/libc/test/src/math/smoke/log10f_test.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/math/log10f.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLog10fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcLog10fTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp
+index dc3489fddf99..82c2f9465f4f 100644
+--- a/libc/test/src/math/smoke/log1pf_test.cpp
++++ b/libc/test/src/math/smoke/log1pf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/log1pf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLog1pfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcLog1pfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp
+index 0534d00b1f40..6bf1ce3249a6 100644
+--- a/libc/test/src/math/smoke/log2_test.cpp
++++ b/libc/test/src/math/smoke/log2_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/log2.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLog2Test = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcLog2Test, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp
+index 53d54ac36763..80e13a2374ad 100644
+--- a/libc/test/src/math/smoke/log2f_test.cpp
++++ b/libc/test/src/math/smoke/log2f_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/log2f.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLog2fTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcLog2fTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp
+index 09e9ab0a9a4d..1d7761a56803 100644
+--- a/libc/test/src/math/smoke/log_test.cpp
++++ b/libc/test/src/math/smoke/log_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/log.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLogTest = LIBC_NAMESPACE::testing::FPTest<double>;
+ 
+ TEST_F(LlvmLibcLogTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/logf_test.cpp b/libc/test/src/math/smoke/logf_test.cpp
+index faba50e9b240..f58209e26f05 100644
+--- a/libc/test/src/math/smoke/logf_test.cpp
++++ b/libc/test/src/math/smoke/logf_test.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/math/logf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcLogfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcLogfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/powf_test.cpp b/libc/test/src/math/smoke/powf_test.cpp
+index 0d1a650385fb..76e8e8cee7a9 100644
+--- a/libc/test/src/math/smoke/powf_test.cpp
++++ b/libc/test/src/math/smoke/powf_test.cpp
+@@ -7,13 +7,12 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/math/powf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcPowfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode;
+ using LIBC_NAMESPACE::fputil::testing::RoundingMode;
+diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp
+index 8ba0d04347bb..7ff0ba7cdefc 100644
+--- a/libc/test/src/math/smoke/sincosf_test.cpp
++++ b/libc/test/src/math/smoke/sincosf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/sincosf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp
+index 8173969fb256..8ba66ed4a807 100644
+--- a/libc/test/src/math/smoke/sinf_test.cpp
++++ b/libc/test/src/math/smoke/sinf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/sinf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcSinfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp
+index ea6a4474a780..5976a1f16944 100644
+--- a/libc/test/src/math/smoke/sinhf_test.cpp
++++ b/libc/test/src/math/smoke/sinhf_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+@@ -14,8 +15,6 @@
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcSinhfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp
+index b840f3980eda..4a725e02ffec 100644
+--- a/libc/test/src/math/smoke/sinpif_test.cpp
++++ b/libc/test/src/math/smoke/sinpif_test.cpp
+@@ -6,12 +6,11 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/sinpif.h"
+ #include "test/UnitTest/FPMatcher.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcSinpifTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp
+index 12deca5cf941..c85907c835f0 100644
+--- a/libc/test/src/math/smoke/tanf_test.cpp
++++ b/libc/test/src/math/smoke/tanf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/tanf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcTanfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp
+index b12a331b3190..57c05735c543 100644
+--- a/libc/test/src/math/smoke/tanhf_test.cpp
++++ b/libc/test/src/math/smoke/tanhf_test.cpp
+@@ -7,14 +7,13 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/math_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/FPUtil/FPBits.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/math/tanhf.h"
+ #include "test/UnitTest/FPMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+ TEST_F(LlvmLibcTanhfTest, SpecialNumbers) {
+diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp
+index ecc70194b649..27949eb5ec90 100644
+--- a/libc/test/src/math/tanf_test.cpp
++++ b/libc/test/src/math/tanf_test.cpp
+@@ -15,7 +15,7 @@
+ #include "test/src/math/sdcomp26094.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
+index 966ce649e2b3..27a6cbcb8013 100644
+--- a/libc/test/src/math/tanhf_test.cpp
++++ b/libc/test/src/math/tanhf_test.cpp
+@@ -14,7 +14,7 @@
+ #include "test/UnitTest/Test.h"
+ #include "utils/MPFRWrapper/MPFRUtils.h"
+ 
+-#include <stdint.h>
++#include "hdr/stdint_proxy.h"
+ 
+ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest<float>;
+ 
+diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt
+index f86ce2ae9685..6b5041d1dedd 100644
+--- a/libc/test/src/signal/CMakeLists.txt
++++ b/libc/test/src/signal/CMakeLists.txt
+@@ -119,6 +119,7 @@ add_libc_unittest(
+     sigaltstack_test.cpp
+   DEPENDS
+     libc.hdr.signal_macros
++    libc.hdr.stdint_proxy
+     libc.src.errno.errno
+     libc.src.signal.raise
+     libc.src.signal.sigaltstack
+diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp
+index ce4dfddae248..a9c5cd939581 100644
+--- a/libc/test/src/signal/sigaltstack_test.cpp
++++ b/libc/test/src/signal/sigaltstack_test.cpp
+@@ -7,6 +7,7 @@
+ //===----------------------------------------------------------------------===//
+ 
+ #include "hdr/signal_macros.h"
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+ #include "src/__support/libc_errno.h"
+ #include "src/signal/linux/signal_utils.h"
+@@ -16,7 +17,6 @@
+ #include "test/UnitTest/ErrnoSetterMatcher.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+ #include <sys/syscall.h>
+ 
+ constexpr int LOCAL_VAR_SIZE = 512;
+diff --git a/libc/test/src/spawn/CMakeLists.txt b/libc/test/src/spawn/CMakeLists.txt
+index ce246d4a1b77..04814db46dca 100644
+--- a/libc/test/src/spawn/CMakeLists.txt
++++ b/libc/test/src/spawn/CMakeLists.txt
+@@ -7,6 +7,7 @@ add_libc_unittest(
+   SRCS
+     posix_spawn_file_actions_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.spawn
+     libc.src.spawn.file_actions
+     libc.src.spawn.posix_spawn_file_actions_addclose
+diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
+index 01ccb8218ee2..935a3540d9a5 100644
+--- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
++++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp
+@@ -6,6 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/spawn/file_actions.h"
+ #include "src/spawn/posix_spawn_file_actions_addclose.h"
+@@ -16,7 +17,6 @@
+ #include "test/UnitTest/Test.h"
+ 
+ #include <spawn.h>
+-#include <stdint.h>
+ 
+ TEST(LlvmLibcPosixSpawnFileActionsTest, AddActions) {
+   posix_spawn_file_actions_t actions;
+diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
+index 45fd49b6d352..0eb373c3fa06 100644
+--- a/libc/test/src/stdlib/CMakeLists.txt
++++ b/libc/test/src/stdlib/CMakeLists.txt
+@@ -99,6 +99,7 @@ add_libc_test(
+   SRCS
+     strtoint32_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.str_to_integer
+     libc.src.errno.errno
+     .strtol_test_support
+@@ -111,6 +112,7 @@ add_libc_test(
+   SRCS
+     strtoint64_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.str_to_integer
+     libc.src.errno.errno
+     .strtol_test_support
+@@ -380,6 +382,7 @@ add_libc_test(
+   SRCS
+     memalignment_test.cpp
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.include.stdlib
+     libc.src.stdlib.memalignment
+ )
+diff --git a/libc/test/src/stdlib/memalignment_test.cpp b/libc/test/src/stdlib/memalignment_test.cpp
+index 2ca1b79e69e3..f1640e996e26 100644
+--- a/libc/test/src/stdlib/memalignment_test.cpp
++++ b/libc/test/src/stdlib/memalignment_test.cpp
+@@ -6,11 +6,10 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/stdlib/memalignment.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ TEST(LlvmLibcMemAlignmentTest, NullPointer) {
+   void *ptr = nullptr;
+   EXPECT_EQ(LIBC_NAMESPACE::memalignment(ptr), static_cast<size_t>(0));
+diff --git a/libc/test/src/stdlib/strtoint32_test.cpp b/libc/test/src/stdlib/strtoint32_test.cpp
+index e6da692714d2..1bf199412e58 100644
+--- a/libc/test/src/stdlib/strtoint32_test.cpp
++++ b/libc/test/src/stdlib/strtoint32_test.cpp
+@@ -6,8 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
+-#include <stdint.h>
+-
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/str_to_integer.h"
+diff --git a/libc/test/src/stdlib/strtoint64_test.cpp b/libc/test/src/stdlib/strtoint64_test.cpp
+index 2c5d948f5fae..2b009d7cea69 100644
+--- a/libc/test/src/stdlib/strtoint64_test.cpp
++++ b/libc/test/src/stdlib/strtoint64_test.cpp
+@@ -6,8 +6,7 @@
+ //
+ //===----------------------------------------------------------------------===//
+ 
+-#include <stdint.h>
+-
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/libc_errno.h"
+ #include "src/__support/macros/config.h"
+ #include "src/__support/str_to_integer.h"
+diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
+index 8374be4a1d01..d6d6d09fc100 100644
+--- a/libc/test/src/string/memory_utils/CMakeLists.txt
++++ b/libc/test/src/string/memory_utils/CMakeLists.txt
+@@ -9,6 +9,7 @@ add_libc_test(
+   COMPILE_OPTIONS
+     ${LIBC_COMPILE_OPTIONS_NATIVE}
+   DEPENDS
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.array
+     libc.src.__support.CPP.cstddef
+     libc.src.__support.CPP.span
+diff --git a/libc/test/src/string/memory_utils/memory_check_utils.h b/libc/test/src/string/memory_utils/memory_check_utils.h
+index db9cfda672b2..c38039ebd3dd 100644
+--- a/libc/test/src/string/memory_utils/memory_check_utils.h
++++ b/libc/test/src/string/memory_utils/memory_check_utils.h
+@@ -9,13 +9,13 @@
+ #ifndef LIBC_TEST_SRC_STRING_MEMORY_UTILS_MEMORY_CHECK_UTILS_H
+ #define LIBC_TEST_SRC_STRING_MEMORY_UTILS_MEMORY_CHECK_UTILS_H
+ 
++#include "hdr/stdint_proxy.h" // uintxx_t
+ #include "src/__support/CPP/span.h"
+ #include "src/__support/libc_assert.h" // LIBC_ASSERT
+ #include "src/__support/macros/config.h"
+ #include "src/__support/macros/sanitizer.h"
+ #include "src/string/memory_utils/utils.h"
+ #include <stddef.h> // size_t
+-#include <stdint.h> // uintxx_t
+ #include <stdlib.h> // malloc/free
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/test/src/string/memory_utils/protected_pages.h b/libc/test/src/string/memory_utils/protected_pages.h
+index 50cfbaa37937..65578d106ce5 100644
+--- a/libc/test/src/string/memory_utils/protected_pages.h
++++ b/libc/test/src/string/memory_utils/protected_pages.h
+@@ -18,9 +18,9 @@
+ #error "Protected pages requires mmap and cannot be used in full build mode."
+ #endif // defined(LIBC_FULL_BUILD) || !defined(LIBC_TARGET_OS_IS_LINUX)
+ 
++#include "hdr/stdint_proxy.h"                // uint8_t
+ #include "src/__support/macros/attributes.h" // LIBC_INLINE
+ #include <stddef.h>                          // size_t
+-#include <stdint.h>                          // uint8_t
+ #include <sys/mman.h>                        // mmap, munmap
+ #include <unistd.h>                          // sysconf, _SC_PAGESIZE
+ 
+diff --git a/libc/utils/MPCWrapper/CMakeLists.txt b/libc/utils/MPCWrapper/CMakeLists.txt
+index 7120eaf18e65..2df8cfa349be 100644
+--- a/libc/utils/MPCWrapper/CMakeLists.txt
++++ b/libc/utils/MPCWrapper/CMakeLists.txt
+@@ -10,6 +10,7 @@ if(LIBC_TESTS_CAN_USE_MPC)
+   add_dependencies(
+     libcMPCWrapper
+     libcMPCommon
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.array
+     libc.src.__support.CPP.string
+     libc.src.__support.CPP.stringstream
+diff --git a/libc/utils/MPCWrapper/MPCUtils.cpp b/libc/utils/MPCWrapper/MPCUtils.cpp
+index 7c7821d8d423..11e93ec5cb89 100644
+--- a/libc/utils/MPCWrapper/MPCUtils.cpp
++++ b/libc/utils/MPCWrapper/MPCUtils.cpp
+@@ -8,13 +8,12 @@
+ 
+ #include "MPCUtils.h"
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/array.h"
+ #include "src/__support/CPP/stringstream.h"
+ #include "utils/MPCWrapper/mpc_inc.h"
+ #include "utils/MPFRWrapper/MPCommon.h"
+ 
+-#include <stdint.h>
+-
+ template <typename T> using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
+ 
+ namespace LIBC_NAMESPACE_DECL {
+diff --git a/libc/utils/MPCWrapper/MPCUtils.h b/libc/utils/MPCWrapper/MPCUtils.h
+index d4c57240c751..d4ab672bcddb 100644
+--- a/libc/utils/MPCWrapper/MPCUtils.h
++++ b/libc/utils/MPCWrapper/MPCUtils.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_UTILS_MPCWRAPPER_MPCUTILS_H
+ #define LLVM_LIBC_UTILS_MPCWRAPPER_MPCUTILS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/complex_type.h"
+ #include "src/__support/macros/config.h"
+@@ -17,8 +18,6 @@
+ #include "test/UnitTest/RoundingModeUtils.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ namespace mpc {
+diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
+index bac43474a9c7..2669b23ba43f 100644
+--- a/libc/utils/MPFRWrapper/CMakeLists.txt
++++ b/libc/utils/MPFRWrapper/CMakeLists.txt
+@@ -10,6 +10,7 @@ if(LIBC_TESTS_CAN_USE_MPFR OR LIBC_TESTS_CAN_USE_MPC)
+   target_compile_options(libcMPCommon PRIVATE -O3 ${compile_options})
+   add_dependencies(
+     libcMPCommon
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.string
+     libc.src.__support.CPP.string_view
+     libc.src.__support.CPP.type_traits
+@@ -39,6 +40,7 @@ if(LIBC_TESTS_CAN_USE_MPFR)
+   add_dependencies(
+     libcMPFRWrapper
+     libcMPCommon
++    libc.hdr.stdint_proxy
+     libc.src.__support.CPP.array
+     libc.src.__support.CPP.stringstream
+     libc.src.__support.FPUtil.fp_bits
+diff --git a/libc/utils/MPFRWrapper/MPCommon.h b/libc/utils/MPFRWrapper/MPCommon.h
+index af03ff054e1a..8bcc69c247a3 100644
+--- a/libc/utils/MPFRWrapper/MPCommon.h
++++ b/libc/utils/MPFRWrapper/MPCommon.h
+@@ -9,6 +9,7 @@
+ #ifndef LLVM_LIBC_UTILS_MPFRWRAPPER_MPCOMMON_H
+ #define LLVM_LIBC_UTILS_MPFRWRAPPER_MPCOMMON_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/string.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/FPUtil/FPBits.h"
+@@ -16,8 +17,6 @@
+ #include "src/__support/macros/properties/types.h"
+ #include "test/UnitTest/RoundingModeUtils.h"
+ 
+-#include <stdint.h>
+-
+ #include "mpfr_inc.h"
+ 
+ #ifdef LIBC_TYPES_FLOAT128_IS_NOT_LONG_DOUBLE
+diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
+index c77a6aa3adea..45468c6cb19a 100644
+--- a/libc/utils/MPFRWrapper/MPFRUtils.h
++++ b/libc/utils/MPFRWrapper/MPFRUtils.h
+@@ -9,13 +9,12 @@
+ #ifndef LLVM_LIBC_UTILS_MPFRWRAPPER_MPFRUTILS_H
+ #define LLVM_LIBC_UTILS_MPFRWRAPPER_MPFRUTILS_H
+ 
++#include "hdr/stdint_proxy.h"
+ #include "src/__support/CPP/type_traits.h"
+ #include "src/__support/macros/config.h"
+ #include "test/UnitTest/RoundingModeUtils.h"
+ #include "test/UnitTest/Test.h"
+ 
+-#include <stdint.h>
+-
+ namespace LIBC_NAMESPACE_DECL {
+ namespace testing {
+ namespace mpfr {
+diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+index cd608a1352a7..31de58049d2f 100644
+--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
++++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+@@ -281,6 +281,12 @@ libc_support_library(
+     hdrs = ["hdr/wchar_overlay.h"],
+ )
+ 
++libc_support_library(
++    name = "hdr_stdint_proxy",
++    hdrs = ["hdr/stdint_proxy.h"],
++)
++
++
+ ############################ Type Proxy Header Files ###########################
+ 
+ libc_support_library(
+@@ -557,6 +563,7 @@ libc_support_library(
+         ":__support_macros_attributes",
+         ":__support_macros_config",
+         ":__support_macros_properties_architectures",
++        ":hdr_stdint_proxy",
+     ],
+ )
+ 
+diff --git a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+index 090bd1b1ba7a..156515309d8c 100644
+--- a/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
++++ b/utils/bazel/llvm-project-overlay/libc/test/UnitTest/BUILD.bazel
+@@ -22,6 +22,7 @@ libc_test_library(
+         "//libc:__support_macros_properties_types",
+         "//libc:__support_osutil_io",
+         "//libc:__support_uint128",
++        "//libc:hdr_stdint_proxy",
+         "//libc:func_aligned_alloc",
+         "//libc:func_free",
+         "//libc:func_malloc",
+@@ -64,6 +65,7 @@ libc_test_library(
+         "//libc:__support_macros_properties_types",
+         "//libc:__support_stringutil",
+         "//libc:__support_uint128",
++        "//libc:hdr_stdint_proxy",
+         "//libc:errno",
+         "//libc:func_aligned_alloc",
+         "//libc:func_free",
+@@ -121,6 +123,7 @@ libc_test_library(
+         "//libc:__support_macros_properties_architectures",
+         "//libc:hdr_fenv_macros",
+         "//libc:hdr_math_macros",
++        "//libc:hdr_stdint_proxy",
+         "//libc:types_fenv_t",
+     ],
+ )
+@@ -157,6 +160,7 @@ libc_test_library(
+         ":string_utils",
+         "//libc:__support_fputil_fp_bits",
+         "//libc:__support_macros_config",
++        "//libc:hdr_stdint_proxy",
+         "//libc:printf_core_structs",
+     ],
+ )
+diff --git a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
+index 41a706d2d92f..8cb7821d71e2 100644
+--- a/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
++++ b/utils/bazel/llvm-project-overlay/libc/test/libc_test_rules.bzl
+@@ -44,6 +44,7 @@ def libc_test(
+       **kwargs: Attributes relevant for a cc_test.
+     """
+     deps = deps + [
++        "//libc:hdr_stdint_proxy",
+         "//libc:__support_macros_config",
+         "//libc:__support_libc_errno",
+         "//libc:errno",
+-- 
+2.43.0
+
diff --git a/arm-software/embedded/patches/llvm-project/0010-libc-Add-missing-libc.include.inttypes-for-targets-i.patch b/arm-software/embedded/patches/llvm-project/0010-libc-Add-missing-libc.include.inttypes-for-targets-i.patch
new file mode 100644
index 000000000000..b234f875a56e
--- /dev/null
+++ b/arm-software/embedded/patches/llvm-project/0010-libc-Add-missing-libc.include.inttypes-for-targets-i.patch
@@ -0,0 +1,46 @@
+From 577b913a170345070e47bcd311763495f6718dab Mon Sep 17 00:00:00 2001
+From: lntue <lntue@google.com>
+Date: Thu, 24 Jul 2025 04:39:33 +0000
+Subject: [libc] Add missing libc.include.inttypes for targets including
+ <inttypes.h>. (#150345)
+
+---
+ libc/include/CMakeLists.txt               | 1 +
+ libc/src/stdio/printf_core/CMakeLists.txt | 2 ++
+ 2 files changed, 3 insertions(+)
+
+diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt
+index ce811d17784a..b31e1237bec7 100644
+--- a/libc/include/CMakeLists.txt
++++ b/libc/include/CMakeLists.txt
+@@ -188,6 +188,7 @@ add_header_macro(
+   arpa/inet.h
+   DEPENDS
+     .llvm_libc_common_h
++    .inttypes
+ )
+ 
+ add_header_macro(
+diff --git a/libc/src/stdio/printf_core/CMakeLists.txt b/libc/src/stdio/printf_core/CMakeLists.txt
+index c22f9858f3b1..76eb0a2fdaaa 100644
+--- a/libc/src/stdio/printf_core/CMakeLists.txt
++++ b/libc/src/stdio/printf_core/CMakeLists.txt
+@@ -44,6 +44,7 @@ add_header_library(
+   HDRS
+     core_structs.h
+   DEPENDS
++    libc.include.inttypes
+     libc.src.__support.CPP.string_view
+     libc.src.__support.FPUtil.fp_bits
+ )
+@@ -97,6 +98,7 @@ add_header_library(
+     .core_structs
+     .printf_config
+     .writer
++    libc.include.inttypes
+     libc.src.__support.big_int
+     libc.src.__support.common
+     libc.src.__support.CPP.limits
+-- 
+2.43.0
+
diff --git a/arm-software/embedded/patches/llvm-project/0011-libc-Add-misssing-inttypes-dependencies-150861.patch b/arm-software/embedded/patches/llvm-project/0011-libc-Add-misssing-inttypes-dependencies-150861.patch
new file mode 100644
index 000000000000..afa7b6fc2bd4
--- /dev/null
+++ b/arm-software/embedded/patches/llvm-project/0011-libc-Add-misssing-inttypes-dependencies-150861.patch
@@ -0,0 +1,47 @@
+From 3f6191b21a48cab6093e94d0496fe9ac99048022 Mon Sep 17 00:00:00 2001
+From: Haowei <haowei@google.com>
+Date: Sun, 27 Jul 2025 18:57:04 -0700
+Subject: [libc] Add misssing inttypes dependencies (#150861)
+
+This is a follow up of 9e7999147de757107482d8a2cedab4155a0b6635. It
+attempts to fix the CI flakes we saw on fuchsia-linux-x64 builder.
+---
+ libc/src/stdio/baremetal/CMakeLists.txt  | 1 +
+ libc/src/stdio/scanf_core/CMakeLists.txt | 2 ++
+ 2 files changed, 3 insertions(+)
+
+diff --git a/libc/src/stdio/baremetal/CMakeLists.txt b/libc/src/stdio/baremetal/CMakeLists.txt
+index e879230a9d02..548938f885c9 100644
+--- a/libc/src/stdio/baremetal/CMakeLists.txt
++++ b/libc/src/stdio/baremetal/CMakeLists.txt
+@@ -72,6 +72,7 @@ add_entrypoint_object(
+     ../scanf.h
+   DEPENDS
+     .scanf_internal
++    libc.include.inttypes
+     libc.src.stdio.scanf_core.scanf_main
+     libc.src.__support.arg_list
+     libc.src.__support.OSUtil.osutil
+diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
+index dee125c234a1..561180c6100b 100644
+--- a/libc/src/stdio/scanf_core/CMakeLists.txt
++++ b/libc/src/stdio/scanf_core/CMakeLists.txt
+@@ -35,6 +35,7 @@ add_header_library(
+     core_structs.h
+   DEPENDS
+     .scanf_config
++    libc.include.inttypes
+     libc.src.__support.CPP.string_view
+     libc.src.__support.CPP.bitset
+     libc.src.__support.FPUtil.fp_bits
+@@ -97,6 +98,7 @@ add_header_library(
+   DEPENDS
+     .reader
+     .core_structs
++    libc.include.inttypes
+     libc.src.__support.common
+     libc.src.__support.ctype_utils
+     libc.src.__support.CPP.bitset
+-- 
+2.43.0
+
diff --git a/arm-software/embedded/patches/picolibc/0004-Revert-Delete-dummyhost.patch b/arm-software/embedded/patches/picolibc/0004-Revert-Delete-dummyhost.patch
new file mode 100644
index 000000000000..05ccc6bc66bd
--- /dev/null
+++ b/arm-software/embedded/patches/picolibc/0004-Revert-Delete-dummyhost.patch
@@ -0,0 +1,231 @@
+From 12b4ed4c5744c995e901bb54e659109f9e03efb4 Mon Sep 17 00:00:00 2001
+From: Mark Murray <mark.murray@arm.com>
+Date: Mon, 4 Aug 2025 11:21:18 +0100
+Subject: [PATCH] Revert "Delete 'dummyhost'."
+
+Needed for the 21.x release to buy us time to properly fix
+https://jira.arm.com/browse/LLVMREQ-935
+
+This reverts commit 6972284de615b135522f0eb247a2cf04f7021ba4.
+---
+ dummyhost/exit.c      | 43 ++++++++++++++++++++++++
+ dummyhost/iob.c       | 76 +++++++++++++++++++++++++++++++++++++++++++
+ dummyhost/meson.build | 58 +++++++++++++++++++++++++++++++++
+ meson.build           |  3 ++
+ 4 files changed, 180 insertions(+)
+ create mode 100644 dummyhost/exit.c
+ create mode 100644 dummyhost/iob.c
+ create mode 100644 dummyhost/meson.build
+
+diff --git a/dummyhost/exit.c b/dummyhost/exit.c
+new file mode 100644
+index 000000000..eee23f5c8
+--- /dev/null
++++ b/dummyhost/exit.c
+@@ -0,0 +1,43 @@
++/*
++ * SPDX-License-Identifier: BSD-3-Clause
++ *
++ * Copyright © 2023 Keith Packard
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ *    copyright notice, this list of conditions and the following
++ *    disclaimer in the documentation and/or other materials provided
++ *    with the distribution.
++ *
++ * 3. Neither the name of the copyright holder nor the names of its
++ *    contributors may be used to endorse or promote products derived
++ *    from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
++ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
++ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
++ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
++ * OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <unistd.h>
++
++__noreturn void
++_exit(int code)
++{
++    (void) code;
++    for (;;);
++}
+diff --git a/dummyhost/iob.c b/dummyhost/iob.c
+new file mode 100644
+index 000000000..81d60e479
+--- /dev/null
++++ b/dummyhost/iob.c
+@@ -0,0 +1,76 @@
++/*
++ * SPDX-License-Identifier: BSD-3-Clause
++ *
++ * Copyright © 2019 Keith Packard
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright
++ *    notice, this list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above
++ *    copyright notice, this list of conditions and the following
++ *    disclaimer in the documentation and/or other materials provided
++ *    with the distribution.
++ *
++ * 3. Neither the name of the copyright holder nor the names of its
++ *    contributors may be used to endorse or promote products derived
++ *    from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
++ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
++ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
++ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
++ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
++ * OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include <stdio.h>
++
++/*
++ * Dummy stdio hooks. This allows programs to link without requiring
++ * any system-dependent functions. This is only used if the program
++ * doesn't provide its own version of stdin, stdout, stderr
++ */
++
++static int
++dummy_putc(char c, FILE *file)
++{
++	(void) c;
++	(void) file;
++	return (unsigned char) c;
++}
++
++static int
++dummy_getc(FILE *file)
++{
++	(void) file;
++	return EOF;
++}
++
++static int
++dummy_flush(FILE *file)
++{
++	(void) file;
++	return 0;
++}
++
++static FILE __stdio = FDEV_SETUP_STREAM(dummy_putc, dummy_getc, dummy_flush, _FDEV_SETUP_RW);
++
++#ifdef __strong_reference
++#define STDIO_ALIAS(x) __strong_reference(stdin, x);
++#else
++#define STDIO_ALIAS(x) FILE *const x = &__stdio;
++#endif
++
++FILE *const stdin = &__stdio;
++STDIO_ALIAS(stdout);
++STDIO_ALIAS(stderr);
+diff --git a/dummyhost/meson.build b/dummyhost/meson.build
+new file mode 100644
+index 000000000..1acaabd73
+--- /dev/null
++++ b/dummyhost/meson.build
+@@ -0,0 +1,58 @@
++#
++# SPDX-License-Identifier: BSD-3-Clause
++#
++# Copyright © 2019 Keith Packard
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions
++# are met:
++#
++# 1. Redistributions of source code must retain the above copyright
++#    notice, this list of conditions and the following disclaimer.
++#
++# 2. Redistributions in binary form must reproduce the above
++#    copyright notice, this list of conditions and the following
++#    disclaimer in the documentation and/or other materials provided
++#    with the distribution.
++#
++# 3. Neither the name of the copyright holder nor the names of its
++#    contributors may be used to endorse or promote products derived
++#    from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
++# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
++# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
++# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
++# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
++# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
++# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
++# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
++# OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++
++srcs_dummyhost = [
++  'iob.c',
++  'exit.c',
++]
++
++foreach params : targets
++  target = params['name']
++  target_dir = params['dir']
++  target_c_args = params['c_args']
++  target_lib_prefix = params['lib_prefix']
++
++  instdir = join_paths(lib_dir, target_dir)
++
++  libdummyhost_name = 'dummyhost'
++
++  set_variable('lib_dummyhost' + target,
++	       static_library(join_paths(target_dir, target_lib_prefix + libdummyhost_name),
++			      srcs_dummyhost,
++			      install : true,
++			      install_dir : instdir,
++			      include_directories : inc,
++			      c_args : target_c_args + c_args))
++endforeach
+diff --git a/meson.build b/meson.build
+index 00f8ba8b1..710d4be25 100644
+--- a/meson.build
++++ b/meson.build
+@@ -1558,6 +1558,9 @@ bios_bin = []
+ if enable_semihost
+   subdir('semihost')
+ endif
++if tinystdio
++  subdir('dummyhost')
++endif
+ 
+ conf_data.set('__SEMIHOST', has_semihost, description: 'Semihost APIs supported')
+ conf_data.set('__ARM_SEMIHOST', has_arm_semihost, description: 'ARM Semihost APIs supported')
+-- 
+2.34.1
+
diff --git a/arm-software/embedded/scripts/install_dependencies.sh b/arm-software/embedded/scripts/install_dependencies.sh
index 222c68bd3714..ab55bd9eb59b 100755
--- a/arm-software/embedded/scripts/install_dependencies.sh
+++ b/arm-software/embedded/scripts/install_dependencies.sh
@@ -16,7 +16,7 @@ sudo apt-get update && sudo apt-get install -y --no-install-recommends \
     ninja-build=1.10.1-1 \
     python3-pip \
     python3-setuptools \
-    qemu-system-arm=1:6.2+dfsg-2ubuntu6.26
+    qemu-system-arm=1:6.2+dfsg-2ubuntu6.27 \
 
 # Upgrade pip and install meson with a pinned version
 python3 -m pip install --upgrade pip
diff --git a/arm-software/embedded/scripts/test_atfe_sanitizer.sh b/arm-software/embedded/scripts/test_atfe_sanitizer.sh
index cd6d19c55447..7de096f4331e 100755
--- a/arm-software/embedded/scripts/test_atfe_sanitizer.sh
+++ b/arm-software/embedded/scripts/test_atfe_sanitizer.sh
@@ -39,8 +39,8 @@ ninja -j$PROCESSOR_COUNT check-all
 # The picolibc tests do not use lit so do not support this option.
 # Command for each test is splitted across individual lines, to aid in debugging.
 export LIT_OPTS="--ignore-fail"
-ninja -j$PROCESSOR_COUNT check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned
-ninja -j$PROCESSOR_COUNT check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned
-ninja -j$PROCESSOR_COUNT check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned
-ninja -j$PROCESSOR_COUNT check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned
-ninja -j$PROCESSOR_COUNT check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned
+ninja -j$PROCESSOR_COUNT check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
+ninja -j$PROCESSOR_COUNT check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
+ninja -j$PROCESSOR_COUNT check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
+ninja -j$PROCESSOR_COUNT check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
+ninja -j$PROCESSOR_COUNT check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
diff --git a/arm-software/embedded/scripts/test_post-merge.sh b/arm-software/embedded/scripts/test_post-merge.sh
index a26f964e99bb..c2d813fc1517 100755
--- a/arm-software/embedded/scripts/test_post-merge.sh
+++ b/arm-software/embedded/scripts/test_post-merge.sh
@@ -40,13 +40,13 @@ export LIT_OPTS="--ignore-fail"
 ninja -j$PROCESSOR_COUNT \
     check-all \
     check-compiler-rt-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-cxx-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-cxxabi-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-picolibc-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-unwind-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned
+    check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
 
diff --git a/arm-software/embedded/scripts/test_pre-merge.sh b/arm-software/embedded/scripts/test_pre-merge.sh
index b057d25f32f3..e2cdccc3ac4e 100755
--- a/arm-software/embedded/scripts/test_pre-merge.sh
+++ b/arm-software/embedded/scripts/test_pre-merge.sh
@@ -40,12 +40,12 @@ export LIT_OPTS="--ignore-fail"
 ninja -j$PROCESSOR_COUNT \
     check-all \
     check-compiler-rt-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-compiler-rt-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-cxx-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-cxx-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-cxxabi-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-cxxabi-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-picolibc-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned \
+    check-picolibc-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size \
     check-unwind-armv7a_hard_vfpv3_d16_exn_rtti_unaligned \
-    check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned
+    check-unwind-armv7m_hard_fpv5_d16_exn_rtti_unaligned_size
diff --git a/arm-software/embedded/test/multilib/armv4t.test b/arm-software/embedded/test/multilib/armv4t.test
index 9d1020b9ae74..5972d6a8e6c0 100644
--- a/arm-software/embedded/test/multilib/armv4t.test
+++ b/arm-software/embedded/test/multilib/armv4t.test
@@ -1,3 +1,7 @@
-# RUN: %clang -print-multi-directory --target=arm-none-eabi -mfpu=none | FileCheck %s
-# CHECK: arm-none-eabi/armv4t_exn_rtti{{$}}
-# CHECK-EMPTY:
+# RUN: %clang -print-multi-directory --target=arm-none-eabi -mfpu=none | FileCheck %s --check-prefix=ARMV4T-EXN-RTTI
+# ARMV4T-EXN-RTTI: arm-none-eabi/armv4t_exn_rtti_size{{$}}
+# ARMV4T-EXN-RTTI-EMPTY:
+
+# RUN: %clang -print-multi-directory --target=arm-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=ARMV4T
+# ARMV4T: arm-none-eabi/armv4t_size{{$}}
+# ARMV4T-EMPTY:
diff --git a/arm-software/embedded/test/multilib/armv5e.test b/arm-software/embedded/test/multilib/armv5e.test
index 8525f0b01620..43bec5d3fa67 100644
--- a/arm-software/embedded/test/multilib/armv5e.test
+++ b/arm-software/embedded/test/multilib/armv5e.test
@@ -1,3 +1,7 @@
-# RUN: %clang -print-multi-directory --target=armv5e-none-eabi -mfpu=none | FileCheck %s
-# CHECK: arm-none-eabi/armv5te_exn_rtti{{$}}
-# CHECK-EMPTY:
+# RUN: %clang -print-multi-directory --target=armv5e-none-eabi -mfpu=none | FileCheck %s --check-prefix=ARMV5TE-EXN-RTTI
+# ARMV5TE-EXN-RTTI: arm-none-eabi/armv5te_exn_rtti_size{{$}}
+# ARMV5TE-EXN-RTTI-EMPTY:
+
+# RUN: %clang -print-multi-directory --target=armv5e-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=ARMV5TE
+# ARMV5TE: arm-none-eabi/armv5te_size{{$}}
+# ARMV5TE-EMPTY:
diff --git a/arm-software/embedded/test/multilib/armv6m.test b/arm-software/embedded/test/multilib/armv6m.test
index ded79dc4f30a..9ec378d19628 100644
--- a/arm-software/embedded/test/multilib/armv6m.test
+++ b/arm-software/embedded/test/multilib/armv6m.test
@@ -1,19 +1,19 @@
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mfpu=none | FileCheck %s --check-prefix=CHECK-NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mfpu=none -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-NOFP-EXN-RTTI
-# CHECK-NOFP-EXN-RTTI: arm-none-eabi/armv6m_soft_nofp_exn_rtti{{$}}
+# CHECK-NOFP-EXN-RTTI: arm-none-eabi/armv6m_soft_nofp_exn_rtti_size{{$}}
 # CHECK-NOFP-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-NOFP
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-NOFP
-# CHECK-NOFP: arm-none-eabi/armv6m_soft_nofp{{$}}
+# CHECK-NOFP: arm-none-eabi/armv6m_soft_nofp_size{{$}}
 # CHECK-NOFP-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-NOFP
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-NOFP
-# CHECK-BE-NOFP: arm-none-eabi/armebv6m_soft_nofp{{$}}
+# CHECK-BE-NOFP: arm-none-eabi/armebv6m_soft_nofp_size{{$}}
 # CHECK-BE-NOFP-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mbig-endian -mfpu=none | FileCheck %s --check-prefix=CHECK-BE-NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv6m-none-eabi -mbig-endian -mfpu=none -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-NOFP-EXN-RTTI
-# CHECK-BE-NOFP-EXN-RTTI: arm-none-eabi/armebv6m_soft_nofp_exn_rtti{{$}}
+# CHECK-BE-NOFP-EXN-RTTI: arm-none-eabi/armebv6m_soft_nofp_exn_rtti_size{{$}}
 # CHECK-BE-NOFP-EXN-RTTI-EMPTY:
diff --git a/arm-software/embedded/test/multilib/armv7m.test b/arm-software/embedded/test/multilib/armv7m.test
index 9e3c54d93a79..94d0ce8c92ce 100644
--- a/arm-software/embedded/test/multilib/armv7m.test
+++ b/arm-software/embedded/test/multilib/armv7m.test
@@ -4,13 +4,13 @@
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -munaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -mfloat-abi=softfp -munaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=none -mfloat-abi=softfp -munaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI-UNALIGNED
-# NOFP-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_nofp_exn_rtti_unaligned{{$}}
+# NOFP-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_nofp_exn_rtti_unaligned_size{{$}}
 # NOFP-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -mno-unaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -mfloat-abi=softfp -mno-unaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=none -mfloat-abi=softfp -mno-unaligned-access | FileCheck %s --check-prefix=NOFP-EXN-RTTI
-# NOFP-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti{{$}}
+# NOFP-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti_size{{$}}
 # NOFP-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=NOFP-UNALIGNED
@@ -19,13 +19,13 @@
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access | FileCheck %s --check-prefix=NOFP-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mfloat-abi=softfp -munaligned-access | FileCheck %s --check-prefix=NOFP-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mfloat-abi=softfp -munaligned-access | FileCheck %s --check-prefix=NOFP-UNALIGNED
-# NOFP-UNALIGNED: arm-none-eabi/armv7m_soft_nofp_unaligned{{$}}
+# NOFP-UNALIGNED: arm-none-eabi/armv7m_soft_nofp_unaligned_size{{$}}
 # NOFP-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=NOFP
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mfloat-abi=softfp -mno-unaligned-access | FileCheck %s --check-prefix=NOFP
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mfloat-abi=softfp -mno-unaligned-access | FileCheck %s --check-prefix=NOFP
-# NOFP: arm-none-eabi/armv7m_soft_nofp{{$}}
+# NOFP: arm-none-eabi/armv7m_soft_nofp_size{{$}}
 # NOFP-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI-UNALIGNED %s
@@ -36,14 +36,14 @@
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI-UNALIGNED %s
-# SOFT-FPV4-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned{{$}}
+# SOFT-FPV4-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size{{$}}
 # SOFT-FPV4-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4-EXN-RTTI %s
-# SOFT-FPV4-EXN-RTTI: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti{{$}}
+# SOFT-FPV4-EXN-RTTI: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_size{{$}}
 # SOFT-FPV4-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti | FileCheck --check-prefix=SOFT-FPV4-UNALIGNED %s
@@ -54,91 +54,98 @@
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=SOFT-FPV4-UNALIGNED %s
-# SOFT-FPV4-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_unaligned{{$}}
+# SOFT-FPV4-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_unaligned_size{{$}}
 # SOFT-FPV4-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv4-sp-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mfpu=fpv5-d16 -mfloat-abi=softfp -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=SOFT-FPV4 %s
-# SOFT-FPV4: arm-none-eabi/armv7m_soft_fpv4_sp_d16{{$}}
+# SOFT-FPV4: arm-none-eabi/armv7m_soft_fpv4_sp_d16_size{{$}}
 # SOFT-FPV4-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 | FileCheck --check-prefix=FPV4-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 | FileCheck --check-prefix=FPV4-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 -munaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 -munaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI-UNALIGNED %s
-# FPV4-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned{{$}}
+# FPV4-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti_unaligned_size{{$}}
 # FPV4-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -mno-unaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -mno-unaligned-access | FileCheck --check-prefix=FPV4-EXN-RTTI %s
-# FPV4-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti{{$}}
+# FPV4-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti_size{{$}}
 # FPV4-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=FPV4-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=FPV4-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=FPV4-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=FPV4-UNALIGNED %s
-# FPV4-UNALIGNED: arm-none-eabi/armv7m_hard_fpv4_sp_d16_unaligned{{$}}
+# FPV4-UNALIGNED: arm-none-eabi/armv7m_hard_fpv4_sp_d16_unaligned_size{{$}}
 # FPV4-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=FPV4 %s
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=FPV4 %s
-# FPV4: arm-none-eabi/armv7m_hard_fpv4_sp_d16{{$}}
+# FPV4: arm-none-eabi/armv7m_hard_fpv4_sp_d16_size{{$}}
 # FPV4-EMPTY:
 
-# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 | FileCheck --check-prefix=FPV5 %s
-# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 | FileCheck --check-prefix=FPV5 %s
-# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -munaligned-access | FileCheck --check-prefix=FPV5 %s
-# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -munaligned-access | FileCheck --check-prefix=FPV5 %s
-# FPV5: arm-none-eabi/armv7m_hard_fpv5_d16_exn_rtti_unaligned{{$}}
+# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 | FileCheck --check-prefix=FPV5-EXN-RTTI %s
+# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 | FileCheck --check-prefix=FPV5-EXN-RTTI %s
+# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -munaligned-access | FileCheck --check-prefix=FPV5-EXN-RTTI %s
+# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -munaligned-access | FileCheck --check-prefix=FPV5-EXN-RTTI %s
+# FPV5-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv5_d16_exn_rtti_unaligned_size{{$}}
+# FPV5-EXN-RTTI-EMPTY:
+
+# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=FPV5 %s
+# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=FPV5 %s
+# RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=FPV5 %s
+# RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mfpu=fpv5-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=FPV5 %s
+# FPV5: arm-none-eabi/armv7m_hard_fpv5_d16_unaligned_size{{$}}
 # FPV5-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-FPV4
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-FPV4
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-FPV4
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-FPV4
-# CHECK-BE-FPV4: arm-none-eabi/armebv7m_hard_fpv4_sp_d16{{$}}
+# CHECK-BE-FPV4: arm-none-eabi/armebv7m_hard_fpv4_sp_d16_size{{$}}
 # CHECK-BE-FPV4-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 | FileCheck %s --check-prefix=CHECK-BE-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 | FileCheck %s --check-prefix=CHECK-BE-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabihf -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-FPV4-EXN-RTTI
-# CHECK-BE-FPV4-EXN-RTTI: arm-none-eabi/armebv7m_hard_fpv4_sp_d16_exn_rtti{{$}}
+# CHECK-BE-FPV4-EXN-RTTI: arm-none-eabi/armebv7m_hard_fpv4_sp_d16_exn_rtti_size{{$}}
 # CHECK-BE-FPV4-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4
-# CHECK-BE-SOFT-FPV4: arm-none-eabi/armebv7m_soft_fpv4_sp_d16{{$}}
+# CHECK-BE-SOFT-FPV4: arm-none-eabi/armebv7m_soft_fpv4_sp_d16_size{{$}}
 # CHECK-BE-SOFT-FPV4-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=fpv4-sp-d16 -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-FPV4-EXN-RTTI
-# CHECK-BE-SOFT-FPV4-EXN-RTTI: arm-none-eabi/armebv7m_soft_fpv4_sp_d16_exn_rtti{{$}}
+# CHECK-BE-SOFT-FPV4-EXN-RTTI: arm-none-eabi/armebv7m_soft_fpv4_sp_d16_exn_rtti_size{{$}}
 # CHECK-BE-SOFT-FPV4-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP
-# CHECK-BE-SOFT-NOFP: arm-none-eabi/armebv7m_soft_nofp{{$}}
+# CHECK-BE-SOFT-NOFP: arm-none-eabi/armebv7m_soft_nofp_size{{$}}
 # CHECK-BE-SOFT-NOFP-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=none | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7m-none-eabi -mbig-endian -mfpu=none -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=none | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP-EXN-RTTI
 # RUN: %clang -print-multi-directory --target=armv7em-none-eabi -mbig-endian -mfpu=none -mno-unaligned-access | FileCheck %s --check-prefix=CHECK-BE-SOFT-NOFP-EXN-RTTI
-# CHECK-BE-SOFT-NOFP-EXN-RTTI: arm-none-eabi/armebv7m_soft_nofp_exn_rtti{{$}}
+# CHECK-BE-SOFT-NOFP-EXN-RTTI: arm-none-eabi/armebv7m_soft_nofp_exn_rtti_size{{$}}
 # CHECK-BE-SOFT-NOFP-EXN-RTTI-EMPTY:
 
 # %clang -print-multi-directory --target=arm-none-eabi -mcpu=cortex-m3 | FileCheck %s --check-prefix=NONE
diff --git a/arm-software/embedded/test/multilib/armv8.1m.main.test b/arm-software/embedded/test/multilib/armv8.1m.main.test
index 44f9f273f81d..6bea5113b731 100644
--- a/arm-software/embedded/test/multilib/armv8.1m.main.test
+++ b/arm-software/embedded/test/multilib/armv8.1m.main.test
@@ -1,11 +1,11 @@
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none | FileCheck --check-prefix=SOFT-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none -munaligned-access | FileCheck --check-prefix=SOFT-EXN-RTTI-UNALIGNED %s
-# SOFT-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned{{$}}
+# SOFT-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_exn_rtti_unaligned_size{{$}}
 # SOFT-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck --check-prefix=SOFT-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=SOFT-UNALIGNED %s
-# SOFT-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_unaligned{{$}}
+# SOFT-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_unaligned_size{{$}}
 # SOFT-UNALIGNED-EMPTY:
 
 # If both MVE and FPU are present, FPU variant is selected. This is because,
@@ -19,6 +19,13 @@
 # HARD-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned{{$}}
 # HARD-EXN-RTTI-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -Os | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -Os | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -munaligned-access -Os | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -munaligned-access -Os | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED-SIZE %s
+# HARD-EXN-RTTI-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_fp_nomve_exn_rtti_unaligned_size{{$}}
+# HARD-EXN-RTTI-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=HARD-UNALIGNED %s
@@ -26,6 +33,13 @@
 # HARD-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_unaligned{{$}}
 # HARD-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -Os | FileCheck --check-prefix=HARD-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -Os | FileCheck --check-prefix=HARD-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -munaligned-access -Os | FileCheck --check-prefix=HARD-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -munaligned-access -Os | FileCheck --check-prefix=HARD-UNALIGNED-SIZE %s
+# HARD-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_fp_nomve_unaligned_size{{$}}
+# HARD-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -munaligned-access | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED %s
@@ -33,6 +47,13 @@
 # HARD-FPDP-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned{{$}}
 # HARD-FPDP-EXN-RTTI-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -Os | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -Os | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -munaligned-access -Os | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -munaligned-access -Os | FileCheck --check-prefix=HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE %s
+# HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_exn_rtti_unaligned_size{{$}}
+# HARD-FPDP-EXN-RTTI-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-FPDP-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-FPDP-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=HARD-FPDP-UNALIGNED %s
@@ -40,13 +61,25 @@
 # HARD-FPDP-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_unaligned{{$}}
 # HARD-FPDP-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -Os | FileCheck --check-prefix=HARD-FPDP-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -Os | FileCheck --check-prefix=HARD-FPDP-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -munaligned-access -Os | FileCheck --check-prefix=HARD-FPDP-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -munaligned-access -Os | FileCheck --check-prefix=HARD-FPDP-UNALIGNED-SIZE %s
+# HARD-FPDP-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_unaligned_size{{$}}
+# HARD-FPDP-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none | FileCheck --check-prefix=HARD-NOFP-MVE-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -munaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-EXN-RTTI-UNALIGNED %s
 # HARD-NOFP-MVE-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned{{$}}
 # HARD-NOFP-MVE-EXN-RTTI-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -Os | FileCheck --check-prefix=HARD-NOFP-MVE-EXN-RTTI-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -munaligned-access -Os | FileCheck --check-prefix=HARD-NOFP-MVE-EXN-RTTI-UNALIGNED-SIZE %s
+# HARD-NOFP-MVE-EXN-RTTI-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_nofp_mve_exn_rtti_unaligned_size{{$}}
+# HARD-NOFP-MVE-EXN-RTTI-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -mno-unaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-EXN-RTTI %s
-# HARD-NOFP-MVE-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_exn_rtti{{$}}
+# HARD-NOFP-MVE-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_exn_rtti_size{{$}}
 # HARD-NOFP-MVE-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-NOFP-MVE-UNALIGNED %s
@@ -54,123 +87,128 @@
 # HARD-NOFP-MVE-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_unaligned{{$}}
 # HARD-NOFP-MVE-UNALIGNED-EMPTY:
 
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -Os | FileCheck --check-prefix=HARD-NOFP-MVE-UNALIGNED-SIZE %s
+# RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -munaligned-access -Os | FileCheck --check-prefix=HARD-NOFP-MVE-UNALIGNED-SIZE %s
+# HARD-NOFP-MVE-UNALIGNED-SIZE: arm-none-eabi/armv8.1m.main_hard_nofp_mve_unaligned_size{{$}}
+# HARD-NOFP-MVE-UNALIGNED-SIZE-EMPTY:
+
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE %s
-# HARD-NOFP-MVE: arm-none-eabi/armv8.1m.main_hard_nofp_mve{{$}}
+# HARD-NOFP-MVE: arm-none-eabi/armv8.1m.main_hard_nofp_mve_size{{$}}
 # HARD-NOFP-MVE-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI %s
-# HARD-FPDP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti{{$}}
+# HARD-FPDP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_size{{$}}
 # HARD-FPDP-PACBTI-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED %s
-# HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned{{$}}
+# HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_exn_rtti_unaligned_size{{$}}
 # HARD-FPDP-PACBTI-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI %s
-# HARD-FPDP-PACBTI: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti{{$}}
+# HARD-FPDP-PACBTI: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_size{{$}}
 # HARD-FPDP-PACBTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FPDP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FPDP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FPDP-PACBTI-UNALIGNED %s
-# HARD-FPDP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned{{$}}
+# HARD-FPDP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fpdp_nomve_pacret_bti_unaligned_size{{$}}
 # HARD-FPDP-PACBTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI %s
-# HARD-FP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti{{$}}
+# HARD-FP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_size{{$}}
 # HARD-FP-PACBTI-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-EXN-RTTI-UNALIGNED %s
-# HARD-FP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned{{$}}
+# HARD-FP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_exn_rtti_unaligned_size{{$}}
 # HARD-FP-PACBTI-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI %s
-# HARD-FP-PACBTI: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti{{$}}
+# HARD-FP-PACBTI: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_size{{$}}
 # HARD-FP-PACBTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-FP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-FP-PACBTI-UNALIGNED %s
-# HARD-FP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned{{$}}
+# HARD-FP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_fp_nomve_pacret_bti_unaligned_size{{$}}
 # HARD-FP-PACBTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI-EXN-RTTI %s
-# HARD-NOFP-MVE-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti{{$}}
+# HARD-NOFP-MVE-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_size{{$}}
 # HARD-NOFP-MVE-PACBTI-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI-EXN-RTTI-UNALIGNED %s
-# HARD-NOFP-MVE-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned{{$}}
+# HARD-NOFP-MVE-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_exn_rtti_unaligned_size{{$}}
 # HARD-NOFP-MVE-PACBTI-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI %s
-# HARD-NOFP-MVE-PACBTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti{{$}}
+# HARD-NOFP-MVE-PACBTI: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_size{{$}}
 # HARD-NOFP-MVE-PACBTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=HARD-NOFP-MVE-PACBTI-UNALIGNED %s
-# HARD-NOFP-MVE-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned{{$}}
+# HARD-NOFP-MVE-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_hard_nofp_mve_pacret_bti_unaligned_size{{$}}
 # HARD-NOFP-MVE-PACBTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI %s
-# SOFT-NOFP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti{{$}}
+# SOFT-NOFP-PACBTI-EXN-RTTI: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_size{{$}}
 # SOFT-NOFP-PACBTI-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED %s
-# SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned{{$}}
+# SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_exn_rtti_unaligned_size{{$}}
 # SOFT-NOFP-PACBTI-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -mno-unaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI %s
-# SOFT-NOFP-PACBTI: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti{{$}}
+# SOFT-NOFP-PACBTI: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_size{{$}}
 # SOFT-NOFP-PACBTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=SOFT-NOFP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti | FileCheck --check-prefix=SOFT-NOFP-PACBTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -march=armv8.1m.main+mve -mfpu=none -fno-exceptions -fno-rtti -mbranch-protection=pac-ret+bti -munaligned-access | FileCheck --check-prefix=SOFT-NOFP-PACBTI-UNALIGNED %s
-# SOFT-NOFP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned{{$}}
+# SOFT-NOFP-PACBTI-UNALIGNED: arm-none-eabi/armv8.1m.main_soft_nofp_nomve_pacret_bti_unaligned_size{{$}}
 # SOFT-NOFP-PACBTI-UNALIGNED-EMPTY:
 
 
 ### Fallback to Armv7-M ###
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFT-EXN-RTTI %s
-# ARMV7M-SOFT-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti{{$}}
+# ARMV7M-SOFT-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti_size{{$}}
 # ARMV7M-SOFT-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFT %s
-# ARMV7M-SOFT: arm-none-eabi/armv7m_soft_nofp{{$}}
+# ARMV7M-SOFT: arm-none-eabi/armv7m_soft_nofp_size{{$}}
 # ARMV7M-SOFT-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD-EXN-RTTI %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD-EXN-RTTI %s
-# ARMV7M-HARD-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti{{$}}
+# ARMV7M-HARD-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti_size{{$}}
 # ARMV7M-HARD-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD %s
 # RUN: %clang -print-multi-directory --target=armv8.1m.main-none-eabihf -march=armv8.1m.main+mve -mfpu=fp-armv8-fullfp16-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD %s
-# ARMV7M-HARD: arm-none-eabi/armv7m_hard_fpv4_sp_d16{{$}}
+# ARMV7M-HARD: arm-none-eabi/armv7m_hard_fpv4_sp_d16_size{{$}}
 # ARMV7M-HARD-EMPTY:
 
 ## This combination is not supported at all, because we don't provide any
diff --git a/arm-software/embedded/test/multilib/armv8m.main.test b/arm-software/embedded/test/multilib/armv8m.main.test
index db47cfd522c7..5d364da7d2bc 100644
--- a/arm-software/embedded/test/multilib/armv8m.main.test
+++ b/arm-software/embedded/test/multilib/armv8m.main.test
@@ -1,53 +1,53 @@
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none | FileCheck %s --check-prefix=SOFT-EXN-RTTI-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none -munaligned-access | FileCheck %s --check-prefix=SOFT-EXN-RTTI-UNALIGNED
-# SOFT-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8m.main_soft_nofp_exn_rtti_unaligned{{$}}
+# SOFT-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8m.main_soft_nofp_exn_rtti_unaligned_size{{$}}
 # SOFT-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti | FileCheck %s --check-prefix=SOFT-UNALIGNED
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti -munaligned-access | FileCheck %s --check-prefix=SOFT-UNALIGNED
-# SOFT-UNALIGNED: arm-none-eabi/armv8m.main_soft_nofp_unaligned{{$}}
+# SOFT-UNALIGNED: arm-none-eabi/armv8m.main_soft_nofp_unaligned_size{{$}}
 # SOFT-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 -munaligned-access | FileCheck --check-prefix=HARD-EXN-RTTI-UNALIGNED %s
-# HARD-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8m.main_hard_fp_exn_rtti_unaligned{{$}}
+# HARD-EXN-RTTI-UNALIGNED: arm-none-eabi/armv8m.main_hard_fp_exn_rtti_unaligned_size{{$}}
 # HARD-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=HARD-UNALIGNED %s
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti -munaligned-access | FileCheck --check-prefix=HARD-UNALIGNED %s
-# HARD-UNALIGNED: arm-none-eabi/armv8m.main_hard_fp_unaligned{{$}}
+# HARD-UNALIGNED: arm-none-eabi/armv8m.main_hard_fp_unaligned_size{{$}}
 # HARD-UNALIGNED-EMPTY:
 
 # Fallback to armv7m
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFT-EXN-RTTI %s
-# ARMV7M-SOFT-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti{{$}}
+# ARMV7M-SOFT-EXN-RTTI: arm-none-eabi/armv7m_soft_nofp_exn_rtti_size{{$}}
 # ARMV7M-SOFT-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=none -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFT %s
-# ARMV7M-SOFT: arm-none-eabi/armv7m_soft_nofp{{$}}
+# ARMV7M-SOFT: arm-none-eabi/armv7m_soft_nofp_size{{$}}
 # ARMV7M-SOFT-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=fpv5-sp-d16 | FileCheck --check-prefix=ARMV7M-SOFTFP-EXN-RTTI-UNALIGNED %s
-# ARMV7M-SOFTFP-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned{{$}}
+# ARMV7M-SOFTFP-EXN-RTTI-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_unaligned_size{{$}}
 # ARMV7M-SOFTFP-EXN-RTTI-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=fpv5-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFTFP-EXN-RTTI %s
-# ARMV7M-SOFTFP-EXN-RTTI: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti{{$}}
+# ARMV7M-SOFTFP-EXN-RTTI: arm-none-eabi/armv7m_soft_fpv4_sp_d16_exn_rtti_size{{$}}
 # ARMV7M-SOFTFP-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti | FileCheck --check-prefix=ARMV7M-SOFTFP-UNALIGNED %s
-# ARMV7M-SOFTFP-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_unaligned{{$}}
+# ARMV7M-SOFTFP-UNALIGNED: arm-none-eabi/armv7m_soft_fpv4_sp_d16_unaligned_size{{$}}
 # ARMV7M-SOFTFP-UNALIGNED-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabi -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-SOFTFP %s
-# ARMV7M-SOFTFP: arm-none-eabi/armv7m_soft_fpv4_sp_d16{{$}}
+# ARMV7M-SOFTFP: arm-none-eabi/armv7m_soft_fpv4_sp_d16_size{{$}}
 # ARMV7M-SOFTFP-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD-EXN-RTTI %s
-# ARMV7M-HARD-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti{{$}}
+# ARMV7M-HARD-EXN-RTTI: arm-none-eabi/armv7m_hard_fpv4_sp_d16_exn_rtti_size{{$}}
 # ARMV7M-HARD-EXN-RTTI-EMPTY:
 
 # RUN: %clang -print-multi-directory --target=armv8m.main-none-eabihf -mfpu=fpv5-sp-d16 -fno-exceptions -fno-rtti -mno-unaligned-access | FileCheck --check-prefix=ARMV7M-HARD %s
-# ARMV7M-HARD: arm-none-eabi/armv7m_hard_fpv4_sp_d16{{$}}
+# ARMV7M-HARD: arm-none-eabi/armv7m_hard_fpv4_sp_d16_size{{$}}
 # ARMV7M-HARD-EMPTY:
 
diff --git a/arm-software/embedded/versions.json b/arm-software/embedded/versions.json
index fdeee1d48a06..0d5af704be57 100644
--- a/arm-software/embedded/versions.json
+++ b/arm-software/embedded/versions.json
@@ -8,8 +8,8 @@
   "repos": {
     "picolibc": {
       "url": "https://github.com/picolibc/picolibc.git",
-      "tagType": "branch",
-      "tag": "main"
+      "tagType": "commithash",
+      "tag": "c0902a4b372c969be10204a9945c2480a395681b"
     },
     "newlib": {
       "url": "https://github.com/bminor/newlib.git",
diff --git a/arm-software/linux/build.sh b/arm-software/linux/build.sh
index 7f83f61cf623..ad5ff6ed65df 100755
--- a/arm-software/linux/build.sh
+++ b/arm-software/linux/build.sh
@@ -34,7 +34,7 @@ INTERACTIVE=false
 ##########################
 
 RELEASE_FLAGS=${RELEASE_FLAGS:-"false"}
-ATFL_VERSION=${ATFL_VERSION:-"0.0"}
+ATFL_VERSION=${ATFL_VERSION:-"21.1"}
 OS_NAME=${OS_NAME:-"linux"}
 TAR_NAME=${TAR_NAME:-"atfl-${ATFL_VERSION}-${OS_NAME}-`uname -m`.tar.gz"}
 ATFL_ASSERTIONS=${ATFL_ASSERTIONS:-"ON"}
diff --git a/arm-software/linux/docs/UsingBOLT.md b/arm-software/linux/docs/UsingBOLT.md
new file mode 100644
index 000000000000..fc579e048507
--- /dev/null
+++ b/arm-software/linux/docs/UsingBOLT.md
@@ -0,0 +1,128 @@
+## How to use BOLT with our toolchain
+
+BOLT is a post-link optimizer available in ATfL. To benefit from it you can use
+the following tools:
+
+* `llvm-bolt`
+* `llvm-bolt-heatmap`
+* `perf2bolt`
+
+### Example optimization of a pathological case
+
+To try this example, download and adapt our Telemetry repository. You can do
+that with the following commands:
+
+```
+$ git clone https://git.gitlab.arm.com/telemetry-solution/telemetry-solution.git
+
+$ git checkout ffaf62bd11d42213fe175fbe3d313a246c85535f
+
+$ cd telemetry-solution/tools/ustress
+```
+
+As BOLT utilizes relocations, you must ensure their emission. In our example,
+this requires the following modification with the `sed` command:
+
+
+```
+$ sed -i 's/^LINKER_FLAGS = -lm$/LINKER_FLAGS = -lm -Wl,--emit-relocs/' Makefile
+```
+
+Also, you must deactivate the assestions:
+
+```
+$ sed -i 's/^\(\s*assert.*\)$/\/\/\1/' l1i_cache_workload.c
+```
+
+Compile the example for a specific CPU. Available alternatives are:
+
+* Neoverse N1 (`CPU=NEOVERSE-N1`)
+* Neoverse N2 (`CPU=NEOVERSE-N2`)
+* Neoverse V1 (`CPU=NEOVERSE-V1`)
+
+The following example command compiles the example for Neoverse V1:
+
+```
+$ make CC=armclang CPU=NEOVERSE-V1 USE_C=1
+```
+
+You can verify, that the binary file contains relocations:
+
+```
+$ llvm-readelf --sections l1i_cache_workload | grep .rela.text
+  [14] .rela.text        RELA            0000000000000000 0f34a0 0004e0 18   I 45  13  8
+```
+
+Record a performance profile from an example execution of this binary:
+
+```
+$ perf record -e cycles:u -- ./l1i_cache_workload 5
+```
+
+Convert this performance profile to the BOLT format using `perf2bolt`:
+
+```
+$ perf2bolt -p perf.data -o perf.boltdata --nl l1i_cache_workload
+```
+
+Optimize this binary with BOLT:
+
+```
+$ llvm-bolt \
+  l1i_cache_workload \
+  -o l1i_cache_workload.bolt \
+  --data perf.boltdata \
+  --dyno-stats \
+  --print-profile-stats \
+  -reorder-blocks=ext-tsp \
+  -reorder-functions=hfsort \
+  -split-functions \
+  -split-all-cold \
+  -split-eh
+```
+
+The name of the new optimized binary is `l1i_cache_workload.bolt`. You can use
+the `repeat` command of the `csh` shell to obtain a set of comparable
+statistics:
+
+```
+$ csh
+
+% repeat 3 perf stat -e L1I_CACHE_REFILL,instructions -- ./l1i_cache_workload 5
+
+% repeat 3 perf stat -e L1I_CACHE_REFILL,instructions -- ./l1i_cache_workload.bolt 5
+
+% exit
+```
+
+Notice how much faster the second binary executes.
+
+Using the `llvm-bolt-heatmap` tool, you can visualize the improvements with
+heatmaps:
+
+```
+$ perf record -e cycles:u -- ./l1i_cache_workload 5
+
+$ llvm-bolt-heatmap -p perf.data --nl l1i_cache_workload -o heatmap-orig.txt
+
+$ perf record -o perf-bolt.data -e cycles:u -- ./l1i_cache_workload.bolt 5
+
+$ llvm-bolt-heatmap -p perf-bolt.data --nl l1i_cache_workload.bolt -o heatmap-bolt.txt
+```
+
+Inspect both heatmaps and observe the difference:
+
+```
+$ less heatmap-orig.txt
+
+$ less heatmap-bolt.txt
+```
+
+As these heatmaps are usually big, we suggest to use the `aha` utility to
+convert them into HTML pages and view them in a web browser:
+
+```
+$ aha -b -f heatmap-orig.txt heatmap-orig.htm
+
+$ aha -b -f heatmap-bolt.txt heatmap-bolt.html
+```
diff --git a/arm-software/linux/docs/index.md b/arm-software/linux/docs/index.md
index 0770f11a1dd5..a0549e1d7ed2 100644
--- a/arm-software/linux/docs/index.md
+++ b/arm-software/linux/docs/index.md
@@ -13,5 +13,6 @@ platforms.
    Installation
    GettingStarted
    UsingArmPL
+   UsingBOLT
    PortingFromACfL
 ```
diff --git a/clang-tools-extra/clangd/ConfigFragment.h b/clang-tools-extra/clangd/ConfigFragment.h
index 9e00dbc9dc90..c5488c2378a1 100644
--- a/clang-tools-extra/clangd/ConfigFragment.h
+++ b/clang-tools-extra/clangd/ConfigFragment.h
@@ -315,7 +315,7 @@ struct Fragment {
     /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
     /// AngledHeaders), system headers use <> and non-system headers use "".
     /// These can match any suffix of the header file in question.
-    /// Matching is performed against the header text, not its absolute path
+    /// Matching is performed against the absolute path of the header
     /// within the project.
     std::vector<Located<std::string>> QuotedHeaders;
     /// List of regexes for headers that should always be included with a
@@ -323,7 +323,7 @@ struct Fragment {
     /// AngledHeaders (i.e. a header matches a regex in both QuotedHeaders and
     /// AngledHeaders), system headers use <> and non-system headers use "".
     /// These can match any suffix of the header file in question.
-    /// Matching is performed against the header text, not its absolute path
+    /// Matching is performed against the absolute path of the header
     /// within the project.
     std::vector<Located<std::string>> AngledHeaders;
   };
diff --git a/clang-tools-extra/clangd/Headers.cpp b/clang-tools-extra/clangd/Headers.cpp
index 87fd261b906e..b9d67cc6a160 100644
--- a/clang-tools-extra/clangd/Headers.cpp
+++ b/clang-tools-extra/clangd/Headers.cpp
@@ -304,16 +304,17 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
   // FIXME: should we allow (some limited number of) "../header.h"?
   if (llvm::sys::path::is_absolute(Suggested))
     return std::nullopt;
+  auto HeaderPath = llvm::sys::path::convert_to_slash(InsertedHeader.File);
   bool IsAngled = false;
   for (auto &Filter : AngledHeaders) {
-    if (Filter(Suggested)) {
+    if (Filter(HeaderPath)) {
       IsAngled = true;
       break;
     }
   }
   bool IsQuoted = false;
   for (auto &Filter : QuotedHeaders) {
-    if (Filter(Suggested)) {
+    if (Filter(HeaderPath)) {
       IsQuoted = true;
       break;
     }
@@ -324,7 +325,7 @@ IncludeInserter::calculateIncludePath(const HeaderFile &InsertedHeader,
     if (IsAngled && IsQuoted) {
       elog("Header '{0}' matches both quoted and angled regexes, default will "
            "be used.",
-           Suggested);
+           HeaderPath);
     }
     IsAngled = IsAngledByDefault;
   }
diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
index b7c64c7a0674..3c107504e625 100644
--- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
+++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp
@@ -938,7 +938,7 @@ TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) {
   {
     Config C;
     C.Style.AngledHeaders.push_back(
-        [](auto header) { return header == "bar.h"; });
+        [](auto header) { return header.contains("bar.h"); });
     WithContextValue WithCfg(Config::Key, std::move(C));
     Results = completions(TU, Test.point(), {Sym});
     EXPECT_THAT(Results.Completions,
@@ -947,7 +947,7 @@ TEST(CompletionTest, IncludeInsertionRespectsQuotedAngledConfig) {
   {
     Config C;
     C.Style.QuotedHeaders.push_back(
-        [](auto header) { return header == "bar.h"; });
+        [](auto header) { return header.contains("bar.h"); });
     WithContextValue WithCfg(Config::Key, std::move(C));
     Results = completions(TU, Test.point(), {Sym});
     EXPECT_THAT(Results.Completions,
diff --git a/clang-tools-extra/clangd/unittests/HeadersTests.cpp b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
index 751383e3b465..440582e14239 100644
--- a/clang-tools-extra/clangd/unittests/HeadersTests.cpp
+++ b/clang-tools-extra/clangd/unittests/HeadersTests.cpp
@@ -344,6 +344,17 @@ TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketed) {
   EXPECT_EQ(calculate(BarHeader), "<sub/bar.h>");
 }
 
+TEST_F(HeadersTest, ShortenIncludesInSearchPathBracketedFilterByFullPath) {
+  // The filter receives the full path of the header, so it is able to filter by
+  // the parent directory, even if it is part of the include search path
+  AngledHeaders.push_back([](auto Path) {
+    llvm::Regex Pattern("sub/.*");
+    return Pattern.match(Path);
+  });
+  std::string BarHeader = testPath("sub/bar.h");
+  EXPECT_EQ(calculate(BarHeader), "<bar.h>");
+}
+
 TEST_F(HeadersTest, ShortenedIncludeNotInSearchPath) {
   std::string BarHeader =
       llvm::sys::path::convert_to_slash(testPath("sub-2/bar.h"));
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index cf97ab708247..b7aa55a2e20d 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -46,14 +46,18 @@ Major New Features
 Improvements to clangd
 ----------------------
 
-Inlay hints
-^^^^^^^^^^^
+Language feature support
+^^^^^^^^^^^^^^^^^^^^^^^^
 
-Diagnostics
-^^^^^^^^^^^
+- Performance improvements and bugfixes to C++20 Modules support
+- Improved support for C++23 "deducing this"
+- Improvements to objective-c++ support
 
-Semantic Highlighting
-^^^^^^^^^^^^^^^^^^^^^
+New Language Server Protocol features
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Added support for `textDocument/rangesFormatting`
+- Added support for `positionEncoding`
 
 Compile flags
 ^^^^^^^^^^^^^
@@ -64,24 +68,58 @@ Compile flags
 Hover
 ^^^^^
 
+- Fixed a bug that would sometimes prevent documentation comments of standard library functions
+  from being shown
+
 Code completion
 ^^^^^^^^^^^^^^^
 
-Code actions
-^^^^^^^^^^^^
-
-Signature help
-^^^^^^^^^^^^^^
+- Added `HeaderInsertion` config option to control whether code completion inserts a missing
+  header needed for the symbol being completed. This is equivalent to the `--header-insertion`
+  command-line option.
+- Added a `CodePatterns` config option to control whether code completion should offer code
+  patterns as completions in addition to symbols.
 
 Cross-references
 ^^^^^^^^^^^^^^^^
 
-Objective-C
+- References to symbols are now collected in array designators
+- Find-references now works for operators new and delete
+- Improvements to code navigation in templated code
+
+Call hierarchy
+^^^^^^^^^^^^^^
+
+- Call hierarchy now works with the remote index
+- Fixed a bug where call hierarchy could sometimes return bogus results
+
+Inlay hints
 ^^^^^^^^^^^
 
+- Parameter hint forwarding now works for variadic forwarding functions declared in header files
+- Improved presentation of block-end hints
+
+Code actions
+^^^^^^^^^^^^
+
+- Improved the rename refactor's name collision checking logic
+
+Clang-tidy integration
+^^^^^^^^^^^^^^^^^^^^^^
+
+- Disabled the cppcoreguidelines-macro-to-enum checker which is incompatible with clangd
+
+Include-cleaner integration
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Clangd now respects the `AngledHeaders` and `QuotedHeaders` config options for headers
+  inserted to resolve include-cleaner diagnostics
+
 Miscellaneous
 ^^^^^^^^^^^^^
 
+- Various crash fixes and other stability improvements
+
 Improvements to clang-doc
 -------------------------
 
diff --git a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
index 2888e2522675..057b92c14704 100644
--- a/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
+++ b/clang-tools-extra/include-cleaner/include/clang-include-cleaner/Types.h
@@ -136,7 +136,7 @@ struct Header {
   }
   StringRef verbatim() const { return std::get<Verbatim>(Storage); }
 
-  /// For phiscal files, either absolute path or path relative to the execution
+  /// For physical files, either absolute path or path relative to the execution
   /// root. Otherwise just the spelling without surrounding quotes/brackets.
   llvm::StringRef resolvedPath() const;
 
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 0645b1de1db9..e0a7462ca2ce 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -542,6 +542,7 @@ if( CLANG_INCLUDE_TESTS )
     clang_unit_site_config=${CMAKE_CURRENT_BINARY_DIR}/test/Unit/lit.site.cfg
   )
   add_subdirectory(test)
+  add_subdirectory(bindings/python/tests)
 
   if(CLANG_BUILT_STANDALONE)
     umbrella_lit_testsuite_end(check-all)
diff --git a/clang/bindings/python/tests/CMakeLists.txt b/clang/bindings/python/tests/CMakeLists.txt
new file mode 100644
index 000000000000..a0ddabc21bb4
--- /dev/null
+++ b/clang/bindings/python/tests/CMakeLists.txt
@@ -0,0 +1,66 @@
+# Test target to run Python test suite from main build.
+
+# Avoid configurations including '-include' from interfering with
+# our tests by setting CLANG_NO_DEFAULT_CONFIG.
+add_custom_target(check-clang-python
+    COMMAND ${CMAKE_COMMAND} -E env
+            CLANG_NO_DEFAULT_CONFIG=1
+            CLANG_LIBRARY_PATH=$<TARGET_FILE_DIR:libclang>
+            "${Python3_EXECUTABLE}" -m unittest discover
+    DEPENDS libclang
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(RUN_PYTHON_TESTS TRUE)
+set_target_properties(check-clang-python PROPERTIES FOLDER "Clang/Tests")
+
+# Tests require libclang.so which is only built with LLVM_ENABLE_PIC=ON
+if(NOT LLVM_ENABLE_PIC)
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Do not try to run if libclang was built with sanitizers because
+# the sanitizer library will likely be loaded too late to perform
+# interception and will then fail.
+# We could use LD_PRELOAD/DYLD_INSERT_LIBRARIES but this isn't
+# portable so its easier just to not run the tests when building
+# with ASan.
+if(NOT LLVM_USE_SANITIZER STREQUAL "")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Tests fail on Windows, and need someone knowledgeable to fix.
+# It's not clear whether it's a test or a valid binding problem.
+if(WIN32)
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# The Python FFI interface is broken on AIX: https://bugs.python.org/issue38628.
+if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# AArch64, Hexagon, and Sparc have known test failures that need to be
+# addressed.
+# SystemZ has broken Python/FFI interface:
+# https://reviews.llvm.org/D52840#1265716
+if(${LLVM_NATIVE_ARCH} MATCHES "^(AArch64|Hexagon|Sparc|SystemZ)$")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+# Tests will fail if cross-compiling for a different target, as tests will try
+# to use the host Python3_EXECUTABLE and make FFI calls to functions in target
+# libraries.
+if(CMAKE_CROSSCOMPILING)
+  # FIXME: Consider a solution that allows better control over these tests in
+  # a crosscompiling scenario. e.g. registering them with lit to allow them to
+  # be explicitly skipped via appropriate LIT_ARGS, or adding a mechanism to
+  # allow specifying a python interpreter compiled for the target that could
+  # be executed using qemu-user.
+  message(WARNING "check-clang-python not added to check-all as these tests fail in a cross-build setup")
+  set(RUN_PYTHON_TESTS FALSE)
+endif()
+
+if(RUN_PYTHON_TESTS)
+    set_property(GLOBAL APPEND PROPERTY
+                 LLVM_ALL_ADDITIONAL_TEST_TARGETS check-clang-python)
+endif()
diff --git a/clang/test/bindings/python/tests/__init__.py b/clang/bindings/python/tests/__init__.py
similarity index 100%
rename from clang/test/bindings/python/tests/__init__.py
rename to clang/bindings/python/tests/__init__.py
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/a.inc b/clang/bindings/python/tests/cindex/INPUTS/a.inc
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/a.inc
rename to clang/bindings/python/tests/cindex/INPUTS/a.inc
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/b.inc b/clang/bindings/python/tests/cindex/INPUTS/b.inc
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/b.inc
rename to clang/bindings/python/tests/cindex/INPUTS/b.inc
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/compile_commands.json b/clang/bindings/python/tests/cindex/INPUTS/compile_commands.json
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/compile_commands.json
rename to clang/bindings/python/tests/cindex/INPUTS/compile_commands.json
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header1.h b/clang/bindings/python/tests/cindex/INPUTS/header1.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header1.h
rename to clang/bindings/python/tests/cindex/INPUTS/header1.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header2.h b/clang/bindings/python/tests/cindex/INPUTS/header2.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header2.h
rename to clang/bindings/python/tests/cindex/INPUTS/header2.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/header3.h b/clang/bindings/python/tests/cindex/INPUTS/header3.h
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/header3.h
rename to clang/bindings/python/tests/cindex/INPUTS/header3.h
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/hello.cpp b/clang/bindings/python/tests/cindex/INPUTS/hello.cpp
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/hello.cpp
rename to clang/bindings/python/tests/cindex/INPUTS/hello.cpp
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/include.cpp b/clang/bindings/python/tests/cindex/INPUTS/include.cpp
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/include.cpp
rename to clang/bindings/python/tests/cindex/INPUTS/include.cpp
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/parse_arguments.c b/clang/bindings/python/tests/cindex/INPUTS/parse_arguments.c
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/parse_arguments.c
rename to clang/bindings/python/tests/cindex/INPUTS/parse_arguments.c
diff --git a/clang/test/bindings/python/tests/cindex/INPUTS/testfile.c b/clang/bindings/python/tests/cindex/INPUTS/testfile.c
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/INPUTS/testfile.c
rename to clang/bindings/python/tests/cindex/INPUTS/testfile.c
diff --git a/clang/test/bindings/python/tests/cindex/__init__.py b/clang/bindings/python/tests/cindex/__init__.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/__init__.py
rename to clang/bindings/python/tests/cindex/__init__.py
diff --git a/clang/test/bindings/python/tests/cindex/test_access_specifiers.py b/clang/bindings/python/tests/cindex/test_access_specifiers.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_access_specifiers.py
rename to clang/bindings/python/tests/cindex/test_access_specifiers.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cdb.py b/clang/bindings/python/tests/cindex/test_cdb.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cdb.py
rename to clang/bindings/python/tests/cindex/test_cdb.py
diff --git a/clang/test/bindings/python/tests/cindex/test_code_completion.py b/clang/bindings/python/tests/cindex/test_code_completion.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_code_completion.py
rename to clang/bindings/python/tests/cindex/test_code_completion.py
diff --git a/clang/test/bindings/python/tests/cindex/test_comment.py b/clang/bindings/python/tests/cindex/test_comment.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_comment.py
rename to clang/bindings/python/tests/cindex/test_comment.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cursor.py b/clang/bindings/python/tests/cindex/test_cursor.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cursor.py
rename to clang/bindings/python/tests/cindex/test_cursor.py
diff --git a/clang/test/bindings/python/tests/cindex/test_cursor_kind.py b/clang/bindings/python/tests/cindex/test_cursor_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_cursor_kind.py
rename to clang/bindings/python/tests/cindex/test_cursor_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_diagnostics.py b/clang/bindings/python/tests/cindex/test_diagnostics.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_diagnostics.py
rename to clang/bindings/python/tests/cindex/test_diagnostics.py
diff --git a/clang/test/bindings/python/tests/cindex/test_enums.py b/clang/bindings/python/tests/cindex/test_enums.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_enums.py
rename to clang/bindings/python/tests/cindex/test_enums.py
diff --git a/clang/test/bindings/python/tests/cindex/test_exception_specification_kind.py b/clang/bindings/python/tests/cindex/test_exception_specification_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_exception_specification_kind.py
rename to clang/bindings/python/tests/cindex/test_exception_specification_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_file.py b/clang/bindings/python/tests/cindex/test_file.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_file.py
rename to clang/bindings/python/tests/cindex/test_file.py
diff --git a/clang/test/bindings/python/tests/cindex/test_index.py b/clang/bindings/python/tests/cindex/test_index.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_index.py
rename to clang/bindings/python/tests/cindex/test_index.py
diff --git a/clang/test/bindings/python/tests/cindex/test_lib.py b/clang/bindings/python/tests/cindex/test_lib.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_lib.py
rename to clang/bindings/python/tests/cindex/test_lib.py
diff --git a/clang/test/bindings/python/tests/cindex/test_linkage.py b/clang/bindings/python/tests/cindex/test_linkage.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_linkage.py
rename to clang/bindings/python/tests/cindex/test_linkage.py
diff --git a/clang/test/bindings/python/tests/cindex/test_location.py b/clang/bindings/python/tests/cindex/test_location.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_location.py
rename to clang/bindings/python/tests/cindex/test_location.py
diff --git a/clang/test/bindings/python/tests/cindex/test_rewrite.py b/clang/bindings/python/tests/cindex/test_rewrite.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_rewrite.py
rename to clang/bindings/python/tests/cindex/test_rewrite.py
diff --git a/clang/test/bindings/python/tests/cindex/test_source_range.py b/clang/bindings/python/tests/cindex/test_source_range.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_source_range.py
rename to clang/bindings/python/tests/cindex/test_source_range.py
diff --git a/clang/test/bindings/python/tests/cindex/test_tls_kind.py b/clang/bindings/python/tests/cindex/test_tls_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_tls_kind.py
rename to clang/bindings/python/tests/cindex/test_tls_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_token_kind.py b/clang/bindings/python/tests/cindex/test_token_kind.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_token_kind.py
rename to clang/bindings/python/tests/cindex/test_token_kind.py
diff --git a/clang/test/bindings/python/tests/cindex/test_tokens.py b/clang/bindings/python/tests/cindex/test_tokens.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_tokens.py
rename to clang/bindings/python/tests/cindex/test_tokens.py
diff --git a/clang/test/bindings/python/tests/cindex/test_translation_unit.py b/clang/bindings/python/tests/cindex/test_translation_unit.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_translation_unit.py
rename to clang/bindings/python/tests/cindex/test_translation_unit.py
diff --git a/clang/test/bindings/python/tests/cindex/test_type.py b/clang/bindings/python/tests/cindex/test_type.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/test_type.py
rename to clang/bindings/python/tests/cindex/test_type.py
diff --git a/clang/test/bindings/python/tests/cindex/util.py b/clang/bindings/python/tests/cindex/util.py
similarity index 100%
rename from clang/test/bindings/python/tests/cindex/util.py
rename to clang/bindings/python/tests/cindex/util.py
diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake
index 15bc755d110d..d6471160037c 100644
--- a/clang/cmake/caches/PGO.cmake
+++ b/clang/cmake/caches/PGO.cmake
@@ -5,7 +5,7 @@ set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "")
 set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "")
 
 set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "")
-set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "")
+set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE BOOL "")
 set(CLANG_BOOTSTRAP_TARGETS
   generate-profdata
   stage2
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index 0e21ef0244f7..cdeaf3b2509c 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -6390,6 +6390,14 @@ the configuration (without a prefix: ``Auto``).
        IF (...)                        vs.    IF(...)
          <conditional-body>                     <conditional-body>
 
+  * ``bool AfterNot`` If ``true``, put a space between alternative operator ``not`` and the
+    opening parenthesis.
+
+    .. code-block:: c++
+
+       true:                                  false:
+       return not (a || b);            vs.    return not(a || b);
+
   * ``bool AfterOverloadedOperator`` If ``true``, put a space between operator overloading and opening
     parentheses.
 
diff --git a/clang/docs/PointerAuthentication.rst b/clang/docs/PointerAuthentication.rst
index 913291c95444..96eb498bc48b 100644
--- a/clang/docs/PointerAuthentication.rst
+++ b/clang/docs/PointerAuthentication.rst
@@ -47,16 +47,16 @@ This document serves four purposes:
 - It documents several language extensions that are useful on targets using
   pointer authentication.
 
-- It will eventually present a theory of operation for the security mitigation,
-  describing the basic requirements for correctness, various weaknesses in the
-  mechanism, and ways in which programmers can strengthen its protections
-  (including recommendations for language implementors).
+- It presents a theory of operation for the security mitigation, describing the
+  basic requirements for correctness, various weaknesses in the mechanism, and
+  ways in which programmers can strengthen its protections (including
+  recommendations for language implementors).
 
-- It will eventually document the language ABIs currently used for C, C++,
-  Objective-C, and Swift on arm64e, although these are not yet stable on any
-  target.
+- It documents the stable ABI of the C, C++, and Objective-C languages on arm64e
+  platforms.
 
-Basic Concepts
+
+Basic concepts
 --------------
 
 The simple address of an object or function is a **raw pointer**.  A raw
@@ -125,7 +125,7 @@ independently for I and D keys.)
   interfaces or as primitives in a compiler IR because they expose raw
   pointers.  Raw pointers require special attention in the language
   implementation to avoid the accidental creation of exploitable code
-  sequences.
+  sequences; see the section on `Attackable code sequences`_.
 
 The following details are all implementation-defined:
 
@@ -167,10 +167,15 @@ a cryptographic signature, other implementations may be possible.  See
     signing key, and stores it in the high bits as the signature. ``auth``
     removes the signature, computes the same hash, and compares the result with
     the stored signature.  ``strip`` removes the signature without
-    authenticating it.  While ``aut*`` instructions do not themselves trap on
-    failure in Armv8.3 PAuth, they do with the later optional FPAC extension.
-    An implementation can also choose to emulate this trapping behavior by
-    emitting additional instructions around ``aut*``.
+    authenticating it.  The ``aut`` instructions in the baseline Armv8.3 PAuth
+    feature do not guarantee to trap on authentication failure; instead, they
+    simply corrupt the pointer so that later uses will likely trap. Unless the
+    "later use" follows immediately and cannot be recovered from (e.g. with a
+    signal handler), this does not provide adequate protection against
+    `authentication oracles`_, so implementations must emit additional
+    instructions to force an immediate trap. This is unnecessary if the
+    processor provides the optional ``FPAC`` extension, which guarantees an
+    immediate trap.
 
   - ``sign_generic`` corresponds to the ``pacga`` instruction, which takes two
     64-bit values and produces a 64-bit cryptographic hash. Implementations of
@@ -234,7 +239,7 @@ implementation-defined.
 
 .. _Signing schemas:
 
-Signing Schemas
+Signing schemas
 ~~~~~~~~~~~~~~~
 
 Correct use of pointer authentication requires the signing code and the
@@ -255,33 +260,172 @@ signing schema breaks down even more simply:
 It is important that the signing schema be independently derived at all signing
 and authentication sites.  Preferably, the schema should be hard-coded
 everywhere it is needed, but at the very least, it must not be derived by
-inspecting information stored along with the pointer.
+inspecting information stored along with the pointer.  See the section on
+`Attacks on pointer authentication`_ for more information.
 
-Language Features
------------------
 
-There is currently one main pointer authentication language feature:
+Language features
+-----------------
 
-- The language provides the ``<ptrauth.h>`` intrinsic interface for manually
-  signing and authenticating pointers in code.  These can be used in
+There are three levels of the pointer authentication language feature:
+
+- The language implementation automatically signs and authenticates function
+  pointers (and certain data pointers) across a variety of standard situations,
+  including return addresses, function pointers, and C++ virtual functions. The
+  intent is for all pointers to code in program memory to be signed in some way
+  and for all branches to code in program text to authenticate those
+  signatures. In addition to the code pointers themselves, we also use pointer
+  authentication to protect data values that directly or indirectly influence
+  control flow or program integrity, or can provide attackers with some other
+  powerful program compromise.
+
+- The language also provides extensions to override the default rules used by
+  the language implementation.  For example, the ``__ptrauth`` type qualifier
+  can be used to change how pointers or pointer sized integers are signed when
+  they are stored in a particular variable or field; this provides much stronger
+  protection than is guaranteed by the default rules for C function and data
+  pointers.
+
+- Finally, the language provides the ``<ptrauth.h>`` intrinsic interface for
+  manually signing and authenticating pointers in code.  These can be used in
   circumstances where very specific behavior is required.
 
+Language implementation
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For the most part, pointer authentication is an unobserved detail of the
+implementation of the programming language.  Any element of the language
+implementation that would perform an indirect branch to a pointer is implicitly
+altered so that the pointer is signed when first constructed and authenticated
+when the branch is performed.  This includes:
+
+- indirect-call features in the programming language, such as C function
+  pointers, C++ virtual functions, C++ member function pointers, the "blocks"
+  C extension, and so on;
+
+- returning from a function, no matter how it is called; and
+
+- indirect calls introduced by the implementation, such as branches through the
+  global offset table (GOT) used to implement direct calls to functions defined
+  outside of the current shared object.
+
+For more information about this, see the `Language ABI`_ section.
+
+However, some aspects of the implementation are observable by the programmer or
+otherwise require special notice.
+
+C data pointers
+^^^^^^^^^^^^^^^
+
+The current implementation in Clang does not sign pointers to ordinary data by
+default. For a partial explanation of the reasoning behind this, see the
+`Theory of Operation`_ section.
+
+A specific data pointer which is more security-sensitive than most can be
+signed using the `__ptrauth qualifier`_ or using the ``<ptrauth.h>``
+intrinsics.
+
+C function pointers
+^^^^^^^^^^^^^^^^^^^
+
+The C standard imposes restrictions on the representation and semantics of
+function pointer types which make it difficult to achieve satisfactory
+signature diversity in the default language rules.  See `Attacks on pointer
+authentication`_ for more information about signature diversity.  Programmers
+should strongly consider using the ``__ptrauth`` qualifier to improve the
+protections for important function pointers, such as the components of of
+a hand-rolled "v-table"; see the section on the `__ptrauth qualifier`_ for
+details.
+
+The value of a pointer to a C function includes a signature, even when the
+value is cast to a non-function-pointer type like ``void*`` or ``intptr_t``. On
+implementations that use high bits to store the signature, this means that
+relational comparisons and hashes will vary according to the exact signature
+value, which is likely to change between executions of a program.  In some
+implementations, it may also vary based on the exact function pointer type.
+
+Null pointers
+^^^^^^^^^^^^^
+
+In principle, an implementation could derive the signed null pointer value
+simply by applying the standard signing algorithm to the raw null pointer
+value. However, for likely signing algorithms, this would mean that the signed
+null pointer value would no longer be statically known, which would have many
+negative consequences.  For one, it would become substantially more expensive
+to emit null pointer values or to perform null-pointer checks.  For another,
+the pervasive (even if technically unportable) assumption that null pointers
+are bitwise zero would be invalidated, making it substantially more difficult
+to adopt pointer authentication, as well as weakening common optimizations for
+zero-initialized memory such as the use of ``.bzz`` sections.  Therefore it is
+beneficial to treat null pointers specially by giving them their usual
+representation.  On AArch64, this requires additional code when working with
+possibly-null pointers, such as when copying a pointer field that has been
+signed with address diversity.
+
+While this representation of nulls is the safest option for the general case,
+there are some situations in which a null pointer may have important semantic
+or security impact. For that purpose Clang has the concept of a pointer
+authentication schema that signs and authenticates null values.
+
+Return addresses
+^^^^^^^^^^^^^^^^
+
+The current implementation in Clang implicitly signs the return addresses in
+function calls.  While the value of the return address is technically an
+implementation detail of a function, there are some important libraries and
+development tools which rely on manually walking the chain of stack frames.
+These tools must be updated to correctly account for pointer authentication,
+either by stripping signatures (if security is not important for the tool, e.g.
+if it is capturing a stack trace during a crash) or properly authenticating
+them.  More information about how these values are signed is available in the
+`Language ABI`_ section.
+
+C++ virtual functions
+^^^^^^^^^^^^^^^^^^^^^
+
+The current implementation in Clang signs virtual function pointers with
+a discriminator derived from the full signature of the overridden method,
+including the method name and parameter types.  It is possible to write C++
+code that relies on v-table layout remaining constant despite changes to
+a method signature; for example, a parameter might be a ``typedef`` that
+resolves to a different type based on a build setting.  Such code violates
+C++'s One Definition Rule (ODR), but that violation is not normally detected;
+however, pointer authentication will detect it.
 
-Language Extensions
+Language extensions
 ~~~~~~~~~~~~~~~~~~~
 
-Feature Testing
+Feature testing
 ^^^^^^^^^^^^^^^
 
 Whether the current target uses pointer authentication can be tested for with
 a number of different tests.
 
-- ``__has_feature(ptrauth_intrinsics)`` is true if ``<ptrauth.h>`` provides its
-  normal interface.  This may be true even on targets where pointer
-  authentication is not enabled by default.
+- ``__PTRAUTH__`` macro is defined if ``<ptrauth.h>`` provides its normal
+  interface. This implies support for the pointer authentication intrinsics
+  and the ``__ptrauth`` qualifier.
 
-__ptrauth Qualifier
-^^^^^^^^^^^^^^^^^^^
+- ``__has_feature(ptrauth_returns)`` is true if the target uses pointer
+  authentication to protect return addresses.
+
+- ``__has_feature(ptrauth_calls)`` is true if the target uses pointer
+  authentication to protect indirect branches.  On arm64e this implies
+  ``__has_feature(ptrauth_returns)``, ``__has_feature(ptrauth_intrinsics)``,
+  and the ``__PTRAUTH__`` macro.
+
+- For backwards compatibility purposes ``__has_feature(ptrauth_intrinsics)``
+  and ``__has_feature(ptrauth_qualifier)`` are available on arm64e targets.
+  These features are synonymous with each other, and are equivalent to testing
+  for the ``__PTRAUTH__`` macro definition. Use of these features should be
+  restricted to cases where backwards compatibility is required, and should be
+  paired with ``defined(__PTRAUTH__)``.
+
+
+Clang provides several other tests only for historical purposes; for current
+purposes they are all equivalent to ``ptrauth_calls``.
+
+``__ptrauth`` qualifier
+^^^^^^^^^^^^^^^^^^^^^^^
 
 ``__ptrauth(key, address, discriminator)`` is an extended type
 qualifier which causes so-qualified objects to hold pointers or pointer sized
@@ -293,6 +437,11 @@ type, either to a function or to an object, or a pointer sized integer.  It
 currently cannot be an Objective-C pointer type, a C++ reference type, or a
 block pointer type; these restrictions may be lifted in the future.
 
+The current implementation in Clang is known to not provide adequate safety
+guarantees against the creation of `signing oracles`_ when assigning data
+pointers to ``__ptrauth``-qualified gl-values.  See the section on `safe
+derivation`_ for more information.
+
 The qualifier's operands are as follows:
 
 - ``key`` - an expression evaluating to a key value from ``<ptrauth.h>``; must
@@ -327,6 +476,57 @@ a discriminator determined as follows:
   is ``ptrauth_blend_discriminator(&x, discriminator)``; see
   `ptrauth_blend_discriminator`_.
 
+Non-triviality from address diversity
++++++++++++++++++++++++++++++++++++++
+
+Address diversity must impose additional restrictions in order to allow the
+implementation to correctly copy values.  In C++, a type qualified with address
+diversity is treated like a class type with non-trivial copy/move constructors
+and assignment operators, with the usual effect on containing classes and
+unions.  C does not have a standard concept of non-triviality, and so we must
+describe the basic rules here, with the intention of imitating the emergent
+rules of C++:
+
+- A type may be **non-trivial to copy**.
+
+- A type may also be **illegal to copy**. Types that are illegal to copy are
+  always non-trivial to copy.
+
+- A type may also be **address-sensitive**. This includes types that use self
+  referencing pointers, data protected by address diversified pointer
+  authentication, or other similar concepts.
+
+- A type qualified with a ``ptrauth`` qualifier or implicit authentication
+  schema that requires address diversity is non-trivial to copy and
+  address-sensitive.
+
+- An array type is illegal to copy, non-trivial to copy, or address-sensitive
+  if its element type is illegal to copy, non-trivial to copy, or
+  address-sensitive, respectively.
+
+- A struct type is illegal to copy, non-trivial to copy, or address-sensitive
+  if it has a field whose type is illegal to copy, non-trivial to copy, or
+  address-sensitive, respectively.
+
+- A union type is both illegal and non-trivial to copy if it has a field whose
+  type is non-trivial or illegal to copy.
+
+- A union type is address-sensitive if it has a field whose type is
+  address-sensitive.
+
+- A program is ill-formed if it uses a type that is illegal to copy as
+  a function parameter, argument, or return type.
+
+- A program is ill-formed if an expression requires a type to be copied that is
+  illegal to copy.
+
+- Otherwise, copying a type that is non-trivial to copy correctly copies its
+  subobjects.
+
+- Types that are address-sensitive must always be passed and returned
+  indirectly. Thus, changing the address-sensitivity of a type may be
+  ABI-breaking even if its size and alignment do not change.
+
 ``<ptrauth.h>``
 ~~~~~~~~~~~~~~~
 
@@ -433,7 +633,7 @@ Produce a signed pointer for the given raw pointer without applying any
 authentication or extra treatment.  This operation is not required to have the
 same behavior on a null pointer that the language implementation would.
 
-This is a treacherous operation that can easily result in signing oracles.
+This is a treacherous operation that can easily result in `signing oracles`_.
 Programs should use it seldom and carefully.
 
 ``ptrauth_auth_and_resign``
@@ -454,7 +654,29 @@ a null pointer that the language implementation would.
 The code sequence produced for this operation must not be directly attackable.
 However, if the discriminator values are not constant integers, their
 computations may still be attackable.  In the future, Clang should be enhanced
-to guaranteed non-attackability if these expressions are safely-derived.
+to guaranteed non-attackability if these expressions are
+:ref:`safely-derived<Safe derivation>`.
+
+``ptrauth_auth_function``
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  ptrauth_auth_function(pointer, key, discriminator)
+
+Authenticate that ``pointer`` is signed with ``key`` and ``discriminator`` and
+re-sign it to the standard schema for a function pointer of its type.
+
+``pointer`` must have function pointer type.  The result will have the same
+type as ``pointer``.  This operation is not required to have the same behavior
+on a null pointer that the language implementation would.
+
+This operation makes the same attackability guarantees as
+``ptrauth_auth_and_resign``.
+
+If this operation appears syntactically as the function operand of a call,
+Clang guarantees that the call will directly authenticate the function value
+using the given schema rather than re-signing to the standard schema.
 
 ``ptrauth_auth_data``
 ^^^^^^^^^^^^^^^^^^^^^
@@ -500,12 +722,921 @@ type.  Implementations are not required to make all bits of the result equally
 significant; in particular, some implementations are known to not leave
 meaningful data in the low bits.
 
+Standard ``__ptrauth`` qualifiers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``<ptrauth.h>`` additionally provides several macros which expand to
+``__ptrauth`` qualifiers for common ABI situations.
+
+For convenience, these macros expand to nothing when pointer authentication is
+disabled.
+
+These macros can be found in the header; some details of these macros may be
+unstable or implementation-specific.
+
+
+Theory of operation
+-------------------
+
+The threat model of pointer authentication is as follows:
+
+- The attacker has the ability to read and write to a certain range of
+  addresses, possibly the entire address space.  However, they are constrained
+  by the normal rules of the process: for example, they cannot write to memory
+  that is mapped read-only, and if they access unmapped memory it will trigger
+  a trap.
+
+- The attacker has no ability to add arbitrary executable code to the program.
+  For example, the program does not include malicious code to begin with, and
+  the attacker cannot alter existing instructions, load a malicious shared
+  library, or remap writable pages as executable.  If the attacker wants to get
+  the process to perform a specific sequence of actions, they must somehow
+  subvert the normal control flow of the process.
+
+In both of the above paragraphs, it is merely assumed that the attacker's
+*current* capabilities are restricted; that is, their current exploit does not
+directly give them the power to do these things.  The attacker's immediate goal
+may well be to leverage their exploit to gain these capabilities, e.g. to load
+a malicious dynamic library into the process, even though the process does not
+directly contain code to do so.
+
+Note that any bug that fits the above threat model can be immediately exploited
+as a denial-of-service attack by simply performing an illegal access and
+crashing the program.  Pointer authentication cannot protect against this.
+While denial-of-service attacks are unfortunate, they are also unquestionably
+the best possible result of a bug this severe. Therefore, pointer authentication
+enthusiastically embraces the idea of halting the program on a pointer
+authentication failure rather than continuing in a possibly-compromised state.
+
+Pointer authentication is a form of control-flow integrity (CFI) enforcement.
+The basic security hypothesis behind CFI enforcement is that many bugs can only
+be usefully exploited (other than as a denial-of-service) by leveraging them to
+subvert the control flow of the program.  If this is true, then by inhibiting or
+limiting that subversion, it may be possible to largely mitigate the security
+consequences of those bugs by rendering them impractical (or, ideally,
+impossible) to exploit.
+
+Every indirect branch in a program has a purpose.  Using human intelligence, a
+programmer can describe where a particular branch *should* go according to this
+purpose: a ``return`` in ``printf`` should return to the call site, a particular
+call in ``qsort`` should call the comparator that was passed in as an argument,
+and so on.  But for CFI to enforce that every branch in a program goes where it
+*should* in this sense would require CFI to perfectly enforce every semantic
+rule of the program's abstract machine; that is, it would require making the
+programming environment perfectly sound.  That is out of scope.  Instead, the
+goal of CFI is merely to catch attempts to make a branch go somewhere that its
+obviously *shouldn't* for its purpose: for example, to stop a call from
+branching into the middle of a function rather than its beginning.  As the
+information available to CFI gets better about the purpose of the branch, CFI
+can enforce tighter and tighter restrictions on where the branch is permitted to
+go.  Still, ultimately CFI cannot make the program sound.  This may help explain
+why pointer authentication makes some of the choices it does: for example, to
+sign and authenticate mostly code pointers rather than every pointer in the
+program.  Preventing attackers from redirecting branches is both particularly
+important and particularly approachable as a goal.  Detecting corruption more
+broadly is infeasible with these techniques, and the attempt would have far
+higher cost.
+
+Attacks on pointer authentication
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Pointer authentication works as follows.  Every indirect branch in a program has
+a purpose.  For every purpose, the implementation chooses a
+:ref:`signing schema<Signing schemas>`.  At some place where a pointer is known
+to be correct for its purpose, it is signed according to the purpose's schema.
+At every place where the pointer is needed for its purpose, it is authenticated
+according to the purpose's schema.  If that authentication fails, the program is
+halted.
+
+There are a variety of ways to attack this.
+
+Attacks of interest to programmers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These attacks arise from weaknesses in the default protections offered by
+pointer authentication.  They can be addressed by using attributes or intrinsics
+to opt in to stronger protection.
+
+Substitution attacks
+++++++++++++++++++++
+
+An attacker can simply overwrite a pointer intended for one purpose with a
+pointer intended for another purpose if both purposes use the same signing
+schema and that schema does not use address diversity.
+
+The most common source of this weakness is when code relies on using the default
+language rules for C function pointers.  The current implementation uses the
+exact same signing schema for all C function pointers, even for functions of
+substantially different type.  While efforts are ongoing to improve constant
+diversity for C function pointers of different type, there are necessary limits
+to this.  The C standard requires function pointers to be copyable with
+``memcpy``, which means that function pointers can never use address diversity.
+Furthermore, even if a function pointer can only be replaced with another
+function of the exact same type, that can still be useful to an attacker, as in
+the following example of a hand-rolled "v-table":
+
+.. code-block:: c
+
+  struct ObjectOperations {
+    void (*retain)(Object *);
+    void (*release)(Object *);
+    void (*deallocate)(Object *);
+    void (*logStatus)(Object *);
+  };
+
+The weakness in this design is that by lacking any context specific
+discriminator, this means an attacker can substitute any of these fields with
+any other function pointer signed with the default schema. Similarly the lack of
+address diversity allows an attacker to replace the functions in one type's
+"v-table" with those of another. This can be mitigated by overriding the default
+authentication schema with a more specific signing schema for each purpose.  For
+instance, in this example, the ``__ptrauth`` qualifier can be used with a
+different constant discriminator for each field.  Since there's no particular
+reason it's important for this v-table to be copyable with ``memcpy``, the
+functions can also be signed with address diversity:
+
+.. code-block:: c
+
+  #if defined(__PTRAUTH__)
+  #define objectOperation(discriminator) \
+    __ptrauth(ptrauth_key_function_pointer, 1, discriminator)
+  #else
+  #define objectOperation(discriminator)
+  #endif
+
+  struct ObjectOperations {
+    void (*objectOperation(0xf017) retain)(Object *);
+    void (*objectOperation(0x2639) release)(Object *);
+    void (*objectOperation(0x8bb0) deallocate)(Object *);
+    void (*objectOperation(0xc5d4) logStatus)(Object *);
+  };
+
+This weakness can also sometimes be mitigated by simply keeping the signed
+pointer in constant memory, but this is less effective than using better signing
+diversity.
+
+.. _Access path attacks:
+
+Access path attacks
++++++++++++++++++++
+
+If a signed pointer is often accessed indirectly (that is, by first loading the
+address of the object where the signed pointer is stored), an attacker can
+affect uses of it by overwriting the intermediate pointer in the access path.
+
+The most common scenario exhibiting this weakness is an object with a pointer to
+a "v-table" (a structure holding many function pointers). An attacker does not
+need to replace a signed function pointer in the v-table if they can instead
+simply replace the v-table pointer in the object with their own pointer ---
+perhaps to memory where they've constructed their own v-table, or to existing
+memory that coincidentally happens to contain a signed pointer at the right
+offset that's been signed with the right signing schema.
+
+This attack arises because data pointers are not signed by default. It works
+even if the signed pointer uses address diversity: address diversity merely
+means that each pointer is signed with its own storage address,
+which (by design) is invariant to changes in the accessing pointer.
+
+Using sufficiently diverse signing schemas within the v-table can provide
+reasonably strong mitigation against this weakness.  Always use address and type
+diversity in v-tables to prevent attackers from assembling their own v-table.
+Avoid re-using constant discriminators to prevent attackers from replacing a
+v-table pointer with a pointer to totally unrelated memory that just happens to
+contain an similarly-signed pointer, or reused memory containing a different
+type.
+
+Further mitigation can be attained by signing pointers to v-tables. Any
+signature at all should prevent attackers from forging v-table pointers; they
+will need to somehow harvest an existing signed pointer from elsewhere in
+memory.  Using a meaningful constant discriminator will force this to be
+harvested from an object with similar structure (e.g. a different implementation
+of the same interface).  Using address diversity will prevent such harvesting
+entirely.  However, care must be taken when sourcing the v-table pointer
+originally; do not blindly sign a pointer that is not
+:ref:`safely derived<Safe derivation>`.
+
+.. _Signing oracles:
+
+Signing oracles
++++++++++++++++
+
+A signing oracle is a bit of code which can be exploited by an attacker to sign
+an arbitrary pointer in a way that can later be recovered.  Such oracles can be
+used by attackers to forge signatures matching the oracle's signing schema,
+which is likely to cause a total compromise of pointer authentication's
+effectiveness.
+
+This attack only affects ordinary programmers if they are using certain
+treacherous patterns of code.  Currently this includes:
+
+- all uses of the ``__ptrauth_sign_unauthenticated`` intrinsic and
+- assigning values to ``__ptrauth``-qualified l-values.
+
+Care must be taken in these situations to ensure that the pointer being signed
+has been :ref:`safely derived<Safe derivation>` or is otherwise not possible to
+attack.  (In some cases, this may be challenging without compiler support.)
+
+A diagnostic will be added in the future for implicitly dangerous patterns of
+code, such as assigning a non-safely-derived values to a
+``__ptrauth``-qualified l-value.
+
+.. _Authentication oracles:
+
+Authentication oracles
+++++++++++++++++++++++
+
+An authentication oracle is a bit of code which can be exploited by an attacker
+to leak whether a signed pointer is validly signed without halting the program
+if it isn't.  Such oracles can be used to forge signatures matching the oracle's
+signing schema if the attacker can repeatedly invoke the oracle for different
+candidate signed pointers. This is likely to cause a total compromise of pointer
+authentication's effectiveness.
+
+There should be no way for an ordinary programmer to create an authentication
+oracle using the current set of operations. However, implementation flaws in the
+past have occasionally given rise to authentication oracles due to a failure to
+immediately trap on authentication failure.
+
+The likelihood of creating an authentication oracle is why there is currently no
+intrinsic which queries whether a signed pointer is validly signed.
+
+
+Attacks of interest to implementors
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+These attacks are not inherent to the model; they arise from mistakes in either
+implementing or using the `sign` and `auth` operations. Avoiding these mistakes
+requires careful work throughout the system.
+
+Failure to trap on authentication failure
++++++++++++++++++++++++++++++++++++++++++
+
+Any failure to halt the program on an authentication failure is likely to be
+exploitable by attackers to create an
+:ref:`authentication oracle<Authentication oracles>`.
+
+There are several different ways to introduce this problem:
+
+- The implementation might try to halt the program in some way that can be
+  intercepted.
+
+  For example, the Armv8.3 ``aut`` instructions do not directly trap on
+  authentication failure on processors that lack the ``FPAC`` extension.
+  Instead, they corrupt their results to be invalid pointers, with the idea that
+  subsequent uses of those pointers will trigger traps as bad memory accesses.
+  However, most kernels do not immediately halt programs that trap due to bad
+  memory accesses; instead, they notify the process to give it an opportunity to
+  recover. If this happens with an ``auth`` failure, the attacker may be able to
+  exploit the recovery path in a way that creates an oracle. Kernels must
+  provide a way for a process to trap unrecoverably, and this should cover all
+  ``FPAC`` traps. Compilers must ensure that ``auth`` failures trigger an
+  unrecoverable trap, ideally by taking advantage of ``FPAC``, but if necessary
+  by emitting extra instructions.
+
+- A compiler might use an intermediate representation (IR) for ``sign`` and
+  ``auth`` operations that cannot make adequate correctness guarantees.
+
+  For example, suppose that an IR uses ARMv8.3-like semantics for ``auth``: the
+  operation merely corrupts its result on failure instead of promising to trap.
+  A frontend might emit patterns of IR that always follow an ``auth`` with a
+  memory access, thinking that this ensures correctness. But if the IR can be
+  transformed to insert code between the ``auth`` and the access, or if the
+  ``auth`` can be speculated, then this potentially creates an oracle.  It is
+  better for ``auth`` to semantically guarantee to trap, potentially requiring
+  an explicit check in the generated code. An ARMv8.3-like target can avoid this
+  explicit check in the common case by recognizing the pattern of an ``auth``
+  followed immediately by an access.
+
+Attackable code sequences
++++++++++++++++++++++++++
+
+If code that is part of a pointer authentication operation is interleaved with
+code that may itself be vulnerable to attacks, an attacker may be able to use
+this to create a :ref:`signing<Signing oracles>` or
+:ref:`authentication<Authentication oracles>` oracle.
+
+For example, suppose that the compiler is generating a call to a function and
+passing two arguments: a signed constant pointer and a value derived from a
+call.  In ARMv8.3, this code might look like so:
+
+.. code-block:: asm
+
+  adr x19, _callback.        ; compute &_callback
+  paciza x19                 ; sign it with a constant discriminator of 0
+  blr _argGenerator          ; call _argGenerator() (returns in x0)
+  mov x1, x0                 ; move call result to second arg register
+  mov x0, x19                ; move signed &_callback to first arg register
+  blr _function              ; call _function
+
+This code is correct, as would be a sequencing that does *both* the ``adr`` and
+the ``paciza`` after the call to ``_argGenerator``.  But a sequence that
+computes the address of ``_callback`` but leaves it as a raw pointer in a
+register during the call to ``_argGenerator`` would be vulnerable:
+
+.. code-block:: asm
+
+  adr x19, _callback.        ; compute &_callback
+  blr _argGenerator          ; call _argGenerator() (returns in x0)
+  mov x1, x0                 ; move call result to second arg register
+  paciza x19                 ; sign &_callback
+  mov x0, x19                ; move signed &_callback to first arg register
+  blr _function              ; call _function
+
+If ``_argGenerator`` spills ``x19`` (a callee-save register), and if the
+attacker can perform a write during this call, then the attacker can overwrite
+the spill slot with an arbitrary pointer that will eventually be unconditionally
+signed after the function returns.  This would be a signing oracle.
+
+The implementation can avoid this by obeying two basic rules:
+
+- The compiler's intermediate representations (IR) should not provide operations
+  that expose intermediate raw pointers.  This may require providing extra
+  operations that perform useful combinations of operations.
+
+  For example, there should be an "atomic" auth-and-resign operation that should
+  be used instead of emitting an ``auth`` operation whose result is fed into a
+  ``sign``.
+
+  Similarly, if a pointer should be authenticated as part of doing a memory
+  access or a call, then the access or call should be decorated with enough
+  information to perform the authentication; there should not be a separate
+  ``auth`` whose result is used as the pointer operand for the access or call.
+  (In LLVM IR, we do this for calls, but not yet for loads or stores.)
+
+  "Operations" includes things like materializing a signed value to a known
+  function or global variable.  The compiler must be able to recognize and emit
+  this as a unified operation, rather than potentially splitting it up as in
+  the example above.
+
+- The compiler backend should not be too aggressive about scheduling
+  instructions that are part of a pointer authentication operation. This may
+  require custom code-generation of these operations in some cases.
+
+Register clobbering
++++++++++++++++++++
+
+As a refinement of the section on `Attackable code sequences`_, if the attacker
+has the ability to modify arbitrary *register* state at arbitrary points in the
+program, then special care must be taken.
+
+For example, ARMv8.3 might materialize a signed function pointer like so:
+
+.. code-block:: asm
+
+  adr x0, _callback.        ; compute &_callback
+  paciza x0                 ; sign it with a constant discriminator of 0
+
+If an attacker has the ability to overwrite ``x0`` between these two
+instructions, this code sequence is vulnerable to becoming a signing oracle.
+
+For the most part, this sort of attack is not possible: it is a basic element of
+the design of modern computation that register state is private and inviolable.
+However, in systems that support asynchronous interrupts, this property requires
+the cooperation of the interrupt-handling code. If that code saves register
+state to memory, and that memory can be overwritten by an attacker, then
+essentially the attack can overwrite arbitrary register state at an arbitrary
+point.  This could be a concern if the threat model includes attacks on the
+kernel or if the program uses user-space preemptive multitasking.
+
+(Readers might object that an attacker cannot rely on asynchronous interrupts
+triggering at an exact instruction boundary.  In fact, researchers have had some
+success in doing exactly that.  Even ignoring that, though, we should aim to
+protect against lucky attackers just as much as good ones.)
+
+To protect against this, saved register state must be at least partially signed
+(using something like `ptrauth_sign_generic_data`_).  This is required for
+correctness anyway because saved thread states include security-critical
+registers such as SP, FP, PC, and LR (where applicable).  Ideally, this
+signature would cover all the registers, but since saving and restoring
+registers can be very performance-sensitive, that may not be acceptable. It is
+sufficient to set aside a small number of scratch registers that will be
+guaranteed to be preserved correctly; the compiler can then be careful to only
+store critical values like intermediate raw pointers in those registers.
+
+``setjmp`` and ``longjmp`` should sign and authenticate the core registers (SP,
+FP, PC, and LR), but they do not need to worry about intermediate values because
+``setjmp`` can only be called synchronously, and the compiler should never
+schedule pointer-authentication operations interleaved with arbitrary calls.
+
+.. _Relative addresses:
+
+Attacks on relative addressing
+++++++++++++++++++++++++++++++
+
+Relative addressing is a technique used to compress and reduce the load-time
+cost of infrequently-used global data.  The pointer authentication system is
+unlikely to support signing or authenticating a relative address, and in most
+cases it would defeat the point to do so: it would take additional storage
+space, and applying the signature would take extra work at load time.
+
+Relative addressing is not precluded by the use of pointer authentication, but
+it does take extra considerations to make it secure:
+
+- Relative addresses must only be stored in read-only memory.  A writable
+  relative address can be overwritten to point nearly anywhere, making it
+  inherently insecure; this danger can only be compensated for with techniques
+  for protecting arbitrary data like `ptrauth_sign_generic_data`_.
+
+- Relative addresses must only be accessed through signed pointers with adequate
+  diversity.  If an attacker can perform an `access path attack` to replace the
+  pointer through which the relative address is accessed, they can easily cause
+  the relative address to point wherever they want.
+
+Signature forging
++++++++++++++++++
+
+If an attacker can exactly reproduce the behavior of the signing algorithm, and
+they know all the correct inputs to it, then they can perfectly forge a
+signature on an arbitrary pointer.
+
+There are three components to avoiding this mistake:
+
+- The abstract signing algorithm should be good: it should not have glaring
+  flaws which would allow attackers to predict its result with better than
+  random accuracy without knowing all the inputs (like the key values).
+
+- The key values should be kept secret.  If at all possible, they should never
+  be stored in accessible memory, or perhaps only stored encrypted.
+
+- Contexts that are meant to be independently protected should use different
+  key values.  For example, the kernel should not use the same keys as user
+  processes.  Different user processes should also use different keys from each
+  other as much as possible, although this may pose its own technical
+  challenges.
+
+Remapping
++++++++++
+
+If an attacker can change the memory protections on certain pages of the
+program's memory, that can substantially weaken the protections afforded by
+pointer authentication.
+
+- If an attacker can inject their own executable code, they can also certainly
+  inject code that can be used as a :ref:`signing oracle<Signing Oracles>`.
+  The same is true if they can write to the instruction stream.
+
+- If an attacker can remap read-only program data sections to be writable, then
+  any use of :ref:`relative addresses` in global data becomes insecure.
+
+- On platforms that use them, if an attacker can remap the memory containing
+  the `global offset tables`_ as writable, then any unsigned pointers in those
+  tables are insecure.
+
+Remapping memory in this way often requires the attacker to have already
+substantively subverted the control flow of the process.  Nonetheless, if the
+operating system has a mechanism for mapping pages in a way that cannot be
+remapped, this should be used wherever possible.
+
+.. _Safe Derivation:
+
+Safe derivation
+~~~~~~~~~~~~~~~
+
+Whether a data pointer is stored, even briefly, as a raw pointer can affect the
+security-correctness of a program.  (Function pointers are never implicitly
+stored as raw pointers; raw pointers to functions can only be produced with the
+``<ptrauth.h>`` intrinsics.)  Repeated re-signing can also impact performance.
+Clang makes a modest set of guarantees in this area:
+
+- An expression of pointer type is said to be **safely derived** if:
+
+  - it takes the address of a global variable or function, or
+
+  - it is a load from a gl-value of ``__ptrauth``-qualified type, or
+
+  - it is a load from read-only memory that has been initialized from a safely
+    derived source, such as the `data const` section of a binary or library.
+
+- If a value that is safely derived is assigned to a ``__ptrauth``-qualified
+  object, including by initialization, then the value will be directly signed as
+  appropriate for the target qualifier and will not be stored as a raw pointer.
+
+- If the function expression of a call is a gl-value of ``__ptrauth``-qualified
+  type, then the call will be authenticated directly according to the source
+  qualifier and will not be resigned to the default rule for a function pointer
+  of its type.
+
+These guarantees are known to be inadequate for data pointer security. In
+particular, Clang should be enhanced to make the following guarantees:
+
+- A pointer should additionally be considered safely derived if it is:
+
+  - the address of a gl-value that is safely derived,
+
+  - the result of pointer arithmetic on a pointer that is safely derived (with
+    some restrictions on the integer operand),
+
+  - the result of a comma operator where the second operand is safely derived,
+
+  - the result of a conditional operator where the selected operand is safely
+    derived, or
+
+  - the result of loading from a safely derived gl-value.
+
+- A gl-value should be considered safely derived if it is:
+
+  - a dereference of a safely derived pointer,
+
+  - a member access into a safely derived gl-value, or
+
+  - a reference to a variable.
+
+- An access to a safely derived gl-value should be guaranteed to not allow
+  replacement of any of the safely-derived component values at any point in the
+  access.  "Access" should include loading a function pointer.
+
+- Assignments should include pointer-arithmetic operators like ``+=``.
+
+Making these guarantees will require further work, including significant new
+support in LLVM IR.
+
+Furthermore, Clang should implement a warning when assigning a data pointer that
+is not safely derived to a ``__ptrauth``-qualified gl-value.
+
+
+Language ABI
+------------
+
+This section describes the pointer-authentication ABI currently implemented in
+Clang for the Apple arm64e target.  As other targets adopt pointer
+authentication, this section should be generalized to express their ABIs as
+well.
+
+Key assignments
+~~~~~~~~~~~~~~~
+
+ARMv8.3 provides four abstract signing keys: ``IA``, ``IB``, ``DA``, and ``DB``.
+The architecture designates ``IA`` and ``IB`` for signing code pointers and
+``DA`` and ``DB`` for signing data pointers; this is reinforced by two
+properties:
+
+- The ISA provides instructions that perform combined auth+call and auth+load
+  operations; these instructions can only use the ``I`` keys and ``D`` keys,
+  respectively.
+
+- AArch64's TBI feature can be separately enabled for code pointers (controlling
+  whether indirect-branch instructions ignore those bits) and data pointers
+  (controlling whether memory-access instructions) ignore those bits. If TBI is
+  enabled for a kind of pointer, the sign and auth operations preserve the TBI
+  bits when signing with an associated keys (at the cost of shrinking the number
+  of available signing bits by 8).
+
+arm64e then further subdivides the keys as follows:
+
+- The ``A`` keys are used for primarily "global" purposes like signing v-tables
+  and function pointers.  These keys are sometimes called *process-independent*
+  or *cross-process* because on existing OSes they are not changed when changing
+  processes, although this is not a platform guarantee.
+
+- The ``B`` keys are used for primarily "local" purposes like signing return
+  addresses.  These keys are sometimes called *process-specific* because they
+  are typically different between processes. However, they are in fact shared
+  across processes in one situation: systems which provide ``fork`` cannot
+  change these keys in the child process; they can only be changed during
+  ``exec``.
+
+Implementation-defined algorithms and quantities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The cryptographic hash algorithm used to compute signatures in ARMv8.3 is a
+private detail of the hardware implementation.
+
+arm64e restricts constant discriminators (used in ``__ptrauth`` and
+``ptrauth_blend_discriminator``) to the range from 0 to 65535, inclusive.  A 0
+discriminator generally signifies that no blending is required; see the
+documentation for ``ptrauth_blend_discriminator``.  This range is somewhat
+narrow but has two advantages:
+
+- The AArch64 ISA allows an arbitrary 16-bit immediate to be written over the
+  top 16 bits of a register in a single instruction:
+
+  .. code-block:: asm
+
+    movk xN, #0x4849, LSL 48
+
+  This is ideal for the discriminator blending operation because it adds minimal
+  code-size overhead and avoids overwriting any interesting bits from the
+  pointer.  Blending in a wider constant discriminator would either clobber
+  interesting bits (e.g. if it was loaded with ``movk xN, #0x4c4f, LSL 32``) or
+  require significantly more code (e.g. if the discriminator was loaded with a
+  ``mov+bfi`` sequence).
+
+- It is possible to pack a 16-bit discriminator into loader metadata with
+  minimal compromises, whereas a wider discriminator would require extra
+  metadata storage and therefore significantly impact load times.
+
+The string hash used by ``ptrauth_string_discriminator`` is a 64-bit SipHash-2-4
+using the constant seed ``b5d4c9eb79104a796fec8b1b428781d4`` (big-endian), with
+the result reduced by modulo to the range of non-zero discriminators (i.e.
+``(rawHash % 65535) + 1``).
+
+Return addresses
+~~~~~~~~~~~~~~~~
+
+The kernel must ensure that attackers cannot replace LR due to an asynchronous
+exception; see `Register clobbering`_.  If this is done by generally protecting
+LR, then functions which don't spill LR to the stack can avoid signing it
+entirely.  Otherwise, the return address must be signed; on arm64e it is signed
+with the ``IB`` key using the stack pointer on entry as the discriminator.
+
+Protecting return addresses is of such particular importance that the ``IB`` key
+is almost entirely reserved for this purpose.
+
+Global offset tables
+~~~~~~~~~~~~~~~~~~~~
+
+The global offset table (GOT) is not part of the language ABI, but it is a
+common implementation technique for dynamic linking which deserves special
+discussion here.
+
+Whenever possible, signed pointers should be materialized directly in code
+rather than via the GOT, e.g. using an ``adrp+add+pac`` sequence on ARMv8.3.
+This decreases the amount of work necessary at load time to initialize the GOT,
+but more importantly, it defines away the potential for several attacks:
+
+- Attackers cannot change instructions, so there is no way to cause this code
+  sequence to materialize a different pointer, whereas an access via the GOT
+  always has *at minimum* a probabilistic chance to be the target of successful
+  `substitution attacks`_.
+
+- The GOT is a dense pool of fixed pointers at a fixed offset relative to code;
+  attackers can search this pool for useful pointers that can be used in
+  `substitution attacks`_, whereas pointers that are only materialized directly
+  are not so easily available.
+
+- Similarly, attackers can use `access path attacks`_ to replace a pointer to a
+  signed pointer with a pointer to the GOT if the signing schema used within the
+  GOT happens to be the same as the original pointer.  This kind of collision
+  becomes much less likely to be useful the fewer pointers are in the GOT in the
+  first place.
+
+If this can be done for a symbol, then the compiler need only ensure that it
+materializes the signed pointer using registers that are safe against
+`register clobbering`_.
+
+However, many symbols can only be accessed via the GOT, e.g. because they
+resolve to definitions outside of the current image.  In this case, care must
+be taken to ensure that using the GOT does not introduce weaknesses.
+
+- If the entire GOT can be mapped read-only after loading, then no signing is
+  required within the GOT.  In fact, not signing pointers in the GOT is
+  preferable in this case because it makes the GOT useless for the harvesting
+  and access-path attacks above.  Storing raw pointers in this way is usually
+  extremely unsafe, but for the special case of an immutable GOT entry it's fine
+  because the GOT is always accessed via an address that is directly
+  materialized in code and thus provably unattackable.  (But see `Remapping`_.)
+
+- Otherwise, GOT entries which are used for producing a signed pointer constant
+  must be signed.  The signing schema used in the GOT need not match the target
+  signing schema for the signed constant.  To counteract the threats of
+  substitution attacks, it's best if GOT entries can be signed with address
+  diversity.  Using a good constant discriminator as well (perhaps derived from
+  the symbol name) can make it less useful to use a pointer to the GOT as the
+  replacement in an :ref:`access path attack<Access path attacks>`.
+
+In either case, the compiler must ensure that materializing the address of a GOT
+entry as part of producing a signed pointer constant is not vulnerable to
+`register clobbering`_.  If the linker also generates code for this, e.g. for
+call stubs, this generated code must take the same precautions.
+
+Dynamic symbol lookup
+~~~~~~~~~~~~~~~~~~~~~
+
+On platforms that support dynamically loading or resolving symbols it is
+necessary for them to define the pointer authentication semantics of the APIs
+provided to perform such lookups. While the platform may choose to reply
+unsigned pointers from such function and rely on the caller performing the
+initial signing, doing so creates the opportunity for caller side errors that
+create :ref:`signing oracles<Signing Oracles>`.
+
+On arm64e the `dlsym` function is used to resolve a symbol at runtime. If the
+resolved symbol is a function or other code pointer the returned pointer is
+signed using the default function signing schema described in
+:ref:`C function pointers<C function abi>`. If the resolved symbol is not a code pointer it is
+returned as an unsigned pointer.
+
+.. _C function abi:
+
+C function pointers
+~~~~~~~~~~~~~~~~~~~
+
+On arm64e, C function pointers are currently signed with the ``IA`` key without
+address diversity and with a constant discriminator of 0.
+
+The C and C++ standards do not permit C function pointers to be signed with
+address diversity by default: in C++ terms, function pointer types are required
+to be trivially copyable, which means they must be copyable with ``memcpy``.
+
+The use of a uniform constant discriminator greatly simplifies the adoption of
+arm64e, but it is a significant weakness in the mitigation because it allows any
+C function pointer to be replaced with another. Clang supports
+`-fptrauth-function-pointer-type-discrimination`, which enables a variant ABI
+that uses type discrimination for function pointers. When generating the type
+based discriminator for a function type all primitive integer types are
+considered equivalent due to the prevalence of mismatching integer parameter
+types in real world code. Type discrimination of function pointers is
+ABI-incompatible with the standard arm64e ABI, but it can be used in constrained
+contexts such as embedded systems or in code that does not require function
+pointer interoperation with the standard ABI (e.g. because it does not pass
+function pointers back and forth, or only does so through
+``__ptrauth``-qualified l-values).
+
+C++ virtual tables
+~~~~~~~~~~~~~~~~~~
+
+By default the pointer to a C++ virtual table is currently signed with the
+``DA`` key, address diversity, and a constant discriminator equal to the string
+hash (see `ptrauth_string_discriminator`_) of the mangled v-table identifier
+of the primary base class for the v-table. To support existing code or ABI
+constraints it is possible to use the `ptrauth_vtable_pointer` attribute to
+override the schema used for the v-table pointer of the base type of
+polymorphic class hierarchy. This attribute permits the configuration of the
+key, address diversity mode, and any extra constant discriminator to be used.
+
+Virtual functions in a C++ virtual table are signed with the ``IA`` key, address
+diversity, and a constant discriminator equal to the string hash (see
+`ptrauth_string_discriminator`_) of the mangled name of the function which
+originally gave rise to the v-table slot.
+
+C++ dynamic_cast
+~~~~~~~~~~~~~~~~
+
+C++'s ``dynamic_cast`` presents a difficulty relative to other polymorphic
+languages that have a
+`top type <https://en.wikipedia.org/wiki/Any_type>` as the use of declaration
+diversity for v-table pointers results in distinct signing schemas for each
+isolated type hierarchy. As a result it is not possible for the Itanium ABI
+defined ``__dynamic_cast`` entry point to directly authenticate the v-table
+pointer of the provided object.
+
+The current implementation uses a forced authentication of the subject object's
+v-table prior to invoking ``__dynamic_cast`` to partially verify that the
+object's vtable is valid. The ``__dynamic_cast`` implementation currently relies
+on this caller side check to limit the substitutability of the v-table pointer
+with an incorrect or invalid v-table. The subsequent implementation of the
+dynamic cast algorithm is built on pointer auth protected ``type_info`` objects.
+
+In future a richer solution may be developed to support vending the correct
+authentication schema directly to the ``dynamic_cast`` implementation.
+
+C++ std::type_info v-table pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The v-table pointer of the ``std::type_info`` type is signed with the ``DA`` key
+and no additional diversity.
+
+C++ member function pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A member function pointer is signed with the ``IA`` key, no address diversity,
+and a constant discriminator equal to the string hash
+(see `ptrauth_string_discriminator`_) of the member pointer type.  Address
+diversity is not permitted by C++ for member function pointers because they must
+be trivially-copyable types.
+
+The Itanium C++ ABI specifies that member function pointers to virtual functions
+simply store an offset to the correct v-table slot.  This ABI cannot be used
+securely with pointer authentication because there is no safe place to store the
+constant discriminator for the target v-table slot: if it's stored with the
+offset, an attacker can simply overwrite it with the right discriminator for the
+offset.  Even if the programmer never uses pointers to virtual functions, the
+existence of this code path makes all member function pointer dereferences
+insecure.
+
+arm64e changes this ABI so that virtual function pointers are stored using
+dispatch thunks with vague linkage.  Because arm64e supports interoperation with
+``arm64`` code when pointer authentication is disabled, an arm64e member
+function pointer dereference still recognizes the virtual-function
+representation but uses an bogus discriminator on that path that should always
+trap if pointer authentication is enabled dynamically.
+
+The use of dispatch thunks means that ``==`` on member function pointers is no
+longer reliable for virtual functions, but this is acceptable because the
+standard makes no guarantees about it in the first place.
+
+The use of dispatch thunks also is required to support declaration specific
+authentication schemas for v-table pointers.
+
+C++ mangling
+~~~~~~~~~~~~
+
+When the ``__ptrauth`` qualifier appears in a C++ mangled name,
+it is mangled as a vendor qualifier with the signature
+``U9__ptrauthILj<key>ELb<addressDiscriminated>ELj<extraDiscriminator>EE``.
+
+e.g. ``int * __ptrauth(1, 0, 1234)`` will be mangled as
+``U9__ptrauthILj1ELb0ELj1234EE``.
+
+If the vtable pointer authentication scheme of a polymorphic class is overridden
+we mangle the override information with the vendor qualifier
+``__vtptrauth(int key, bool addressDiscriminated, unsigned extraDiscriminator)``,
+where the extra discriminator is the explicit value the specified discrimination
+mode evalutes to.
+
+Blocks
+~~~~~~
+
+Block pointers are data pointers which must interoperate with the ObjC `id` type
+and therefore cannot be signed themselves. As blocks conform to the ObjC `id`
+type, they contain an ``isa`` pointer signed as described
+:ref:`below<Objc isa and super>`.
+
+The invocation pointer in a block is signed with the ``IA`` key using address
+diversity and a constant dicriminator of 0.  Using a uniform discriminator is
+seen as a weakness to be potentially improved, but this is tricky due to the
+subtype polymorphism directly permitted for blocks.
+
+Block descriptors and ``__block`` variables can contain pointers to functions
+that can be used to copy or destroy the object.  These functions are signed with
+the ``IA`` key, address diversity, and a constant discriminator of 0.  The
+structure of block descriptors is under consideration for improvement.
+
+Objective-C runtime
+~~~~~~~~~~~~~~~~~~~
+
+In addition to the compile time ABI design, the Objective-C runtime provides
+additional protection to methods and other metadata that have been loaded into
+the Objective-C method cache; this protection is private to the runtime.
+
+Objective-C methods
+~~~~~~~~~~~~~~~~~~~
+
+Objective-C method lists sign methods with the ``IA`` key using address
+diversity and a constant discriminator of 0.  Using a uniform constant
+discriminator is believed to be acceptable because these tables are only
+accessed internally to the Objective-C runtime.
+
+Objective-C class method list pointer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The method list pointer in Objective-C classes are signed with the ``DA`` key
+using address diversity, and a constant discriminator of 0xC310.
+
+Objective-C class read-only data pointer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The read-only data pointer in Objective-C classes are signed with the ``DA`` key
+using address diversity, and a constant discriminator of 0x61F8.
+
+.. _Objc isa and super:
+
+Objective-C ``isa`` and ``super`` pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An Objective-C object's ``isa`` and ``super`` pointers are both signed with
+the ``DA`` key using address diversity and constant discriminators of 0x6AE1
+and 0x25DA respectively.
+
+Objective-C ``SEL`` pointers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, the type of an Objective-C instance variable of type ``SEL``, when
+the qualifiers do not include an explicit ``__ptrauth`` qualifier, is adjusted
+to be qualified with ``__ptrauth(ptrauth_key_asdb, 1, 0x57C2)``.
+
+This provides a measure of implicit at-rest protection to  Objective-C classes
+that store selectors, as in the common target-action design pattern. This
+prevents attackers from overriding the selector to invoke an arbitrary different
+method, which is a major attack vector in Objective-C. Since ``SEL`` values are
+not normally passed around as signed pointers, there is a
+:ref:`signing oracle<Signing Oracles>` associated with the initialization of the
+ivar, but the use of address and constant diversity limit the risks.
+
+The implicit qualifier means that the type of the ivar does not match its
+declaration, which can cause type errors if the address of the ivar is taken:
+
+.. code-block:: ObjC
+
+  @interface A : NSObject {
+    SEL _s;
+  }
+  @end
+
+  void f(SEL *);
+
+  @implementation A
+  -(void)g
+  {
+     f(&_s);
+  }
+  @end
+
+To fix such an mismatch the schema macro from `<ptrauth.h>`:
+
+.. code-block:: ObjC
+
+  #include <ptrauth.h>
+
+  void f(SEL __ptrauth_objc_sel*);
 
+or less safely, and introducing the possibility of an
+:ref:`signing or authentication oracle<Signing oracles>`, an unauthencaticated
+temporary may be used as intermediate storage.
 
-Alternative Implementations
+Alternative implementations
 ---------------------------
 
-Signature Storage
+Signature storage
 ~~~~~~~~~~~~~~~~~
 
 It is not critical for the security of pointer authentication that the
@@ -536,7 +1667,7 @@ Storing the signature in the high bits, as Armv8.3 does, has several trade-offs:
   return signed pointers.  This means that clients of these APIs will not
   require insecure code in order to correctly receive a function pointer.
 
-Hashing vs. Encrypting Pointers
+Hashing vs. encrypting pointers
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Armv8.3 implements ``sign`` by computing a cryptographic hash and storing that
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bd7a4b20242f..f03a3273c451 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -55,7 +55,7 @@ C/C++ Language Potentially Breaking Changes
   case for old-style offsetof idioms like ``((int)(&(((struct S *)0)->field)))``, to
   ensure they are not caught by these optimizations.  It is also possible to use
   ``-fwrapv-pointer`` or   ``-fno-delete-null-pointer-checks`` to make pointer arithmetic
-  on null pointers well-defined. (#GH130734, #GH130742, #GH130952)
+  on null pointers well-defined. (#GH130734, #GH130952)
 
 C++ Specific Potentially Breaking Changes
 -----------------------------------------
@@ -123,6 +123,8 @@ C++ Language Changes
   a perfect match (all conversion sequences are identity conversions) template candidates are not instantiated.
   Diagnostics that would have resulted from the instantiation of these template candidates are no longer
   produced. This aligns Clang closer to the behavior of GCC, and fixes (#GH62096), (#GH74581), and (#GH74581).
+- Implemented `P2719R5 Type-aware allocation and deallocation functions <https://wg21.link/P2719>`_
+  as an extension in all C++ language modes.
 
 C++2c Feature Support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -359,6 +361,12 @@ Non-comprehensive list of changes in this release
   ARC-managed pointers and other pointer types. The prior behavior was overly
   strict and inconsistent with the ARC specification.
 
+- Use of ``__has_feature`` to detect the ``ptrauth_qualifier`` and ``ptrauth_intrinsics``
+  features has been deprecated, and is restricted to the arm64e target only. The
+  correct method to check for these features is to test for the ``__PTRAUTH__``
+  macro.
+
+
 New Compiler Flags
 ------------------
 
@@ -376,6 +384,13 @@ New Compiler Flags
 
 - New options ``-g[no-]key-instructions`` added, disabled by default. Reduces jumpiness of debug stepping for optimized code in some debuggers (not LLDB at this time). Not recommended for use without optimizations. DWARF only. Note both the positive and negative flags imply ``-g``.
 
+- New options ``-fthinlto-distributor=`` and ``-Xthinlto-distributor=`` added for Integrated Distributed ThinLTO (DTLTO). DTLTO enables the distribution of backend ThinLTO compilations via external distribution systems, such as Incredibuild, during the traditional link step. (#GH147265, `ThinLTODocs <https://clang.llvm.org/docs/ThinLTO.html#integrated-distributed-thinlto-dtlto>`_).
+
+- A new flag - `-static-libclosure` was introduced to support statically linking
+  the runtime for the Blocks extension on Windows. This flag currently only
+  changes the code generation, and even then, only on Windows. This does not
+  impact the linker behaviour like the other `-static-*` flags.
+
 Deprecated Compiler Flags
 -------------------------
 
@@ -662,8 +677,8 @@ Improvements to Clang's diagnostics
   trigger a ``'Blue' is deprecated`` warning, which can be turned off with
   ``-Wno-deprecated-declarations-switch-case``.
 
-- Split diagnosis of implicit integer comparison on negation to a new
-  diagnostic group ``-Wimplicit-int-comparison-on-negation``, grouped under
+- Split diagnosis of implicit integer conversion on negation to a new
+  diagnostic group ``-Wimplicit-int-conversion-on-negation``, grouped under
   ``-Wimplicit-int-conversion``, so user can turn it off independently.
 
 - Improved the FixIts for unused lambda captures.
@@ -674,7 +689,7 @@ Improvements to Clang's diagnostics
   #GH142457, #GH139913, #GH138850, #GH137867, #GH137860, #GH107840, #GH93308,
   #GH69470, #GH59391, #GH58172, #GH46215, #GH45915, #GH45891, #GH44490,
   #GH36703, #GH32903, #GH23312, #GH69874.
-  
+
 - Clang no longer emits a spurious -Wdangling-gsl warning in C++23 when
   iterating over an element of a temporary container in a range-based
   for loop.(#GH109793, #GH145164)
@@ -800,6 +815,8 @@ Bug Fixes in This Version
   declaration statements. Clang now emits a warning for these patterns. (#GH141659)
 - Fixed false positives for redeclaration errors of using enum in
   nested scopes. (#GH147495)
+- Fixed a crash in `clang-scan-deps` when a module with the same name is found
+  in different locations (#GH134404, #GH146976).
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -949,7 +966,6 @@ Bug Fixes to C++ Support
 - Fixed an access checking bug when initializing non-aggregates in default arguments (#GH62444), (#GH83608)
 - Fixed a pack substitution bug in deducing class template partial specializations. (#GH53609)
 - Fixed a crash when constant evaluating some explicit object member assignment operators. (#GH142835)
-- Fixed an access checking bug when substituting into concepts (#GH115838)
 - Fix a bug where private access specifier of overloaded function not respected. (#GH107629)
 - Correctly handles calling an explicit object member function template overload set
   through its address (``(&Foo::bar<baz>)()``).
@@ -967,6 +983,7 @@ Bug Fixes to C++ Support
 - Fix a crash with NTTP when instantiating local class.
 - Fixed a crash involving list-initialization of an empty class with a
   non-empty initializer list. (#GH147949)
+- Fixed constant evaluation of equality comparisons of constexpr-unknown references. (#GH147663)
 
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -993,6 +1010,19 @@ Miscellaneous Clang Crashes Fixed
 OpenACC Specific Changes
 ------------------------
 
+- OpenACC support, enabled via `-fopenacc` has reached a level of completeness
+  to finally be at least notionally usable. Currently, the OpenACC 3.4
+  specification has been completely implemented for Sema and AST creation, so
+  nodes will show up in the AST after having been properly checked. Lowering is
+  currently a work in progress, with compute, loop, and combined constructs
+  partially implemented, plus a handful of data and executable constructs
+  implemented. Lowering will only work in Clang-IR mode (so only with a compiler
+  built with Clang-IR enabled, and with `-fclangir` used on the command line).
+  However, note that the Clang-IR implementation status is also quite partial,
+  so frequent 'not yet implemented' diagnostics should be expected.  Also, the
+  ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
+  code generation is possible for OpenACC.
+
 Target Specific Changes
 -----------------------
 
@@ -1071,6 +1101,8 @@ Windows Support
   extensions are enabled. This allows for properly processing ``intsafe.h`` in
   the Windows SDK.
 
+- Clang now supports `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
+
 LoongArch Support
 ^^^^^^^^^^^^^^^^^
 
@@ -1119,9 +1151,19 @@ CUDA/HIP Language Changes
 CUDA Support
 ^^^^^^^^^^^^
 
+PowerPC Support
+^^^^^^^^^^^^^^^
+
+* Add `__dmr1024` type for Dense Math Facility.
+* Add prototype for Dense Math Facility integer calculation builtins.
+
 AIX Support
 ^^^^^^^^^^^
 
+* Fixed `-print-runtime-dir` to fallback to the target subdirectory (rather than OS subdirectory) if the runtime path is not found.
+* Fixed `-print-runtime-dir` to find the correct runtime path if the triple has "unknown" as the environment component.
+* Changed AIX targets to use the per-target runtime directories for compiler runtimes (i.e. `lib/clang/20/lib/aix` became `lib/clang/21/lib/powerpc-ibm-aix` and `clang/21/lib/powerpc64-ibm-aix`).
+
 NetBSD Support
 ^^^^^^^^^^^^^^
 
@@ -1192,53 +1234,112 @@ Code Completion
 
 Static Analyzer
 ---------------
-- Fixed a crash when C++20 parenthesized initializer lists are used. This issue
-  was causing a crash in clang-tidy. (#GH136041)
 
 New features
 ^^^^^^^^^^^^
 
-- A new flag - `-static-libclosure` was introduced to support statically linking
-  the runtime for the Blocks extension on Windows. This flag currently only
-  changes the code generation, and even then, only on Windows. This does not
-  impact the linker behaviour like the other `-static-*` flags.
-- OpenACC support, enabled via `-fopenacc` has reached a level of completeness
-  to finally be at least notionally usable. Currently, the OpenACC 3.4
-  specification has been completely implemented for Sema and AST creation, so
-  nodes will show up in the AST after having been properly checked. Lowering is
-  currently a work in progress, with compute, loop, and combined constructs
-  partially implemented, plus a handful of data and executable constructs
-  implemented. Lowering will only work in Clang-IR mode (so only with a compiler
-  built with Clang-IR enabled, and with `-fclangir` used on the command line).
-  However, note that the Clang-IR implementation status is also quite partial,
-  so frequent 'not yet implemented' diagnostics should be expected.  Also, the
-  ACC MLIR dialect does not currently implement any lowering to LLVM-IR, so no
-  code generation is possible for OpenACC.
-- Implemented `P2719R5 Type-aware allocation and deallocation functions <https://wg21.link/P2719>`_
-  as an extension in all C++ language modes.
+- Added support for the ``[[clang::assume(cond)]]`` attribute, treating it as
+  ``__builtin_assume(cond)`` for better static analysis. (#GH129234)
+
+- Introduced per-entry-point statistics to provide more detailed analysis metrics.
+  Documentation: :doc:`analyzer/developer-docs/Statistics` (#GH131175)
+
+- Added time-trace scopes for high-level analyzer steps to improve performance
+  debugging. Documentation: :doc:`analyzer/developer-docs/PerformanceInvestigation`
+  (#GH125508, #GH125884)
 
+- Enhanced the ``check::BlockEntrance`` checker callback to provide more granular
+  control over block-level analysis.
+  `Documentation (check::BlockEntrance)
+  <https://clang.llvm.org/doxygen/CheckerDocumentation_8cpp_source.html>`_
+  (#GH140924)
+
+- Added a new checker ``core.FixedAddressDereference`` to detect dereferences
+  of fixed addresses, which can be useful for finding hard-coded memory
+  accesses. (#GH127191, #GH132404)
 
 Crash and bug fixes
 ^^^^^^^^^^^^^^^^^^^
 
-- Fixed a crash in ``UnixAPIMisuseChecker`` and ``MallocChecker`` when analyzing
+- Fixed a crash when C++20 parenthesized initializer lists are used.
+  This affected a crash of the well-known lambda overloaded pattern.
+  (#GH136041, #GH135665)
+
+- Dropped an unjustified assertion, that was triggered in ``BugReporterVisitors.cpp``
+  for variable initialization detection. (#GH125044)
+
+- Fixed a crash in ``unix.API`` and ``unix.Malloc`` when analyzing
   code with non-standard ``getline`` or ``getdelim`` function signatures. (#GH144884)
 
+- Fixed crashes involving ``__builtin_bit_cast``. (#GH139188)
+
+- ``__datasizeof`` (C++) and ``_Countof`` (C) no longer cause a failed assertion
+  when given an operand of VLA type. (#GH151711)
+
+- Fixed a crash in ``alpha.core.CastSize``. (#GH134387)
+
+- Some ``cplusplus.PlacementNew`` false positives were fixed. (#GH150161)
+
 Improvements
 ^^^^^^^^^^^^
 
+- Added option to assume at least one iteration in loops to reduce false positives.
+  (#GH125494)
+
 - The checker option ``optin.cplusplus.VirtualCall:PureOnly`` was removed,
-  because it had been deprecated since 2019 and it is completely useless (it
-  was kept only for compatibility with pre-2019 versions, setting it to true is
-  equivalent to completely disabling the checker).
+  because it had been deprecated since 2019. (#GH131823)
+
+- Enhanced the ``core.StackAddressEscape`` to detect more cases of stack address
+  escapes, including return values for child stack frames. (#GH126620, #GH126986)
+
+- Improved the ``unix.BlockInCriticalSection`` to recognize ``O_NONBLOCK``
+  streams and suppress reports in those cases. (#GH127049)
+
+- Better support for lambda-converted function pointers in analysis. (#GH144906)
+
+- Improved modeling of ``getcwd`` function in ``unix.StdCLibraryFunctions`` checker.
+  (#GH141076)
+
+- Enhanced the ``optin.core.EnumCastOutOfRange`` checker to ignore ``[[clang::flag_enum]]``
+  enums. (#GH141232)
+
+- Improved handling of structured bindings captured by lambdas. (#GH132579, #GH91835)
+
+- Fixed unnamed bitfield handling in ``optin.cplusplus.UninitializedObject``. (#GH132427, #GH132001)
+
+- Enhanced iterator checker modeling for ``insert`` operations. (#GH132596)
+
+- Improved ``format`` attribute handling in ``optin.taint.GenericTaint``. (#GH132765)
+
+- Added support for ``consteval`` in ``ConditionBRVisitor::VisitTerminator``.
+  (#GH146859, #GH139130)
+
+- C standard streams are no longer invalidated by all C library function calls.
+  (#GH147766)
+
+- Enhanced store management with region-store-binding-limit to improve performance.
+  See `region-store-max-binding-fanout
+  <https://clang.llvm.org/docs/analyzer/user-docs/Options.html#region-store-max-binding-fanout>`_
+  config option. Overriding these options are discouraged, unless you know what you do.
+  (#GH127602)
+
+- Updated undefined assignment checker (``core.uninitialized.Assign``) diagnostics
+  to avoid using the term ``garbage``. (#GH126596)
+
+- Fixed false memory leak reports involving placement new. (#GH144341)
+
+- Avoided unnecessary super region invalidation in ``unix.cstring.*`` checkers.
+  (#GH146212, #GH143807)
+
+- Enhanced handling of tainted division-by-zero error paths in the
+  ``optin.taint.TaintedDiv`` checker. (#GH144491)
 
 Moved checkers
 ^^^^^^^^^^^^^^
 
-- After lots of improvements, the checker ``alpha.security.ArrayBoundV2`` is
+- After lots of improvements, the checker ``alpha.security.ArrayBoundV2`` was
   renamed to ``security.ArrayBound``. As this checker is stable now, the old
-  checker ``alpha.security.ArrayBound`` (which was searching for the same kind
-  of bugs with an different, simpler and less accurate algorithm) is removed.
+  checker ``alpha.security.ArrayBound`` was removed.
 
 .. _release-notes-sanitizers:
 
diff --git a/clang/docs/ThinLTO.rst b/clang/docs/ThinLTO.rst
index c04254767891..8cb3e0b2b0d1 100644
--- a/clang/docs/ThinLTO.rst
+++ b/clang/docs/ThinLTO.rst
@@ -240,6 +240,53 @@ The ``BOOTSTRAP_LLVM_ENABLE_LTO=Thin`` will enable ThinLTO for stage 2 and
 stage 3 in case the compiler used for stage 1 does not support the ThinLTO
 option.
 
+Integrated Distributed ThinLTO (DTLTO)
+--------------------------------------
+
+Integrated Distributed ThinLTO (DTLTO) enables the distribution of backend
+ThinLTO compilations via external distribution systems, such as Incredibuild,
+during the traditional link step.
+
+The implementation is documented here: https://llvm.org/docs/DTLTO.html.
+
+Command-Line Options
+^^^^^^^^^^^^^^^^^^^^
+
+DTLTO requires the LLD linker (``-fuse-ld=lld``).
+
+``-fthinlto-distributor=<path>``
+   - Specifies the ``<path>`` to the distributor process executable for DTLTO.
+   - If specified, ThinLTO backend compilations will be distributed by LLD.
+
+``-Xthinlto-distributor=<arg>``
+   - Passes ``<arg>`` to the distributor process (see ``-fthinlto-distributor=``).
+   - Can be specified multiple times to pass multiple options.
+   - Multiple options can also be specified by separating them with commas.
+
+If ``-fthinlto-distributor=`` is specified, Clang supplies the path to a
+compiler to be executed remotely to perform the ThinLTO backend
+compilations. Currently, this is Clang itself.
+
+Usage
+^^^^^
+
+Compilation is unchanged from ThinLTO. DTLTO options need to supplied for the link step:
+
+.. code-block:: console
+
+  % clang -flto=thin -fthinlto-distributor=distribute.sh -Xthinlto-distributor=--verbose,--j10 -fuse-ld=lld file1.o file2.o
+  % clang -flto=thin -fthinlto-distributor=$(which python) -Xthinlto-distributor=distribute.py -fuse-ld=lld file1.o file2.o
+
+When using lld-link:
+
+.. code-block:: console
+
+  % lld-link /out:a.exe file1.obj file2.obj /thinlto-distributor:distribute.exe /thinlto-remote-compiler:${LLVM}\bin\clang.exe /thinlto-distributor-arg:--verbose
+
+Note that currently, DTLTO is only supported in some LLD flavors. Support can
+be added to other LLD flavours in the future.
+See `DTLTO <https://lld.llvm.org/DTLTO.html>`_ for more information.
+
 More Information
 ================
 
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index 4871d05e932a..f603ce34a081 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -45,6 +45,7 @@ Using Clang as a Compiler
    BoundsSafetyImplPlans
    ControlFlowIntegrity
    LTOVisibility
+   PointerAuthentication
    SafeStack
    ShadowCallStack
    SourceBasedCodeCoverage
diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h
index c35311c88641..b929585205ae 100644
--- a/clang/include/clang-c/Index.h
+++ b/clang/include/clang-c/Index.h
@@ -6953,6 +6953,21 @@ clang_getCursorUnaryOperatorKind(CXCursor cursor);
  * @}
  */
 
+CINDEX_DEPRECATED
+typedef void *CXRemapping;
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping clang_getRemappings(const char *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE CXRemapping
+clang_getRemappingsFromFileList(const char **, unsigned);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE unsigned clang_remap_getNumFiles(CXRemapping);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void
+clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *);
+
+CINDEX_DEPRECATED CINDEX_LINKAGE void clang_remap_dispose(CXRemapping);
+
 LLVM_CLANG_C_EXTERN_C_END
 
 #endif
diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h
index 9999a30c51ad..cb942ea865e2 100644
--- a/clang/include/clang/AST/APValue.h
+++ b/clang/include/clang/AST/APValue.h
@@ -143,7 +143,7 @@ class APValue {
     AddrLabelDiff
   };
 
-  class LValueBase {
+  class alignas(uint64_t) LValueBase {
     typedef llvm::PointerUnion<const ValueDecl *, const Expr *, TypeInfoLValue,
                                DynamicAllocLValue>
         PtrTy;
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8c27728c404d..993eca8c4738 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -648,8 +648,7 @@ class ASTContext : public RefCountedBase<ASTContext> {
   bool containsNonRelocatablePointerAuth(QualType T) {
     if (!isPointerAuthenticationAvailable())
       return false;
-    return findPointerAuthContent(T) ==
-           PointerAuthContent::AddressDiscriminatedData;
+    return findPointerAuthContent(T) != PointerAuthContent::None;
   }
 
 private:
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index fefdaba7f8bf..76747d2b1181 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -9417,9 +9417,9 @@ def NonStringDocs : Documentation {
   let Category = DocCatDecl;
   let Content = [{
 The ``nonstring`` attribute can be applied to the declaration of a variable or
-a field whose type is a character array to specify that the character array is
-not intended to behave like a null-terminated string. This will silence
-diagnostics with code like:
+a field whose type is a character pointer or character array to specify that
+the buffer is not intended to behave like a null-terminated string. This will
+silence diagnostics with code like:
 
 .. code-block:: c
 
diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td
index a67b9995d3b5..4c7219c78c8b 100644
--- a/clang/include/clang/Basic/DiagnosticASTKinds.td
+++ b/clang/include/clang/Basic/DiagnosticASTKinds.td
@@ -507,6 +507,14 @@ def note_odr_number_of_bases : Note<
   "class has %0 base %plural{1:class|:classes}0">;
 def note_odr_enumerator : Note<"enumerator %0 with value %1 here">;
 def note_odr_missing_enumerator : Note<"no corresponding enumerator here">;
+def note_odr_incompatible_fixed_underlying_type : Note<
+  "enumeration %0 declared with incompatible fixed underlying types (%1 vs. "
+  "%2)">;
+def note_odr_fixed_underlying_type : Note<
+  "enumeration %0 has fixed underlying type here">;
+def note_odr_missing_fixed_underlying_type : Note<
+  "enumeration %0 missing fixed underlying type here">;
+
 def err_odr_field_type_inconsistent : Error<
   "field %0 declared with incompatible types in different "
   "translation units (%1 vs. %2)">;
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index 72f23614aef1..b040181beaff 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -147,14 +147,17 @@ FEATURE(type_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Type))
 FEATURE(thread_sanitizer, LangOpts.Sanitize.has(SanitizerKind::Thread))
 FEATURE(dataflow_sanitizer, LangOpts.Sanitize.has(SanitizerKind::DataFlow))
 FEATURE(scudo, LangOpts.Sanitize.hasOneOf(SanitizerKind::Scudo))
-FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics)
-EXTENSION(ptrauth_qualifier, LangOpts.PointerAuthIntrinsics)
+FEATURE(ptrauth_intrinsics, LangOpts.PointerAuthIntrinsics &&
+                            PP.getTargetInfo().getTriple().isOSDarwin())
+FEATURE(ptrauth_qualifier, LangOpts.PointerAuthIntrinsics &&
+                           PP.getTargetInfo().getTriple().isOSDarwin())
 FEATURE(ptrauth_calls, LangOpts.PointerAuthCalls)
 FEATURE(ptrauth_returns, LangOpts.PointerAuthReturns)
 FEATURE(ptrauth_vtable_pointer_address_discrimination, LangOpts.PointerAuthVTPtrAddressDiscrimination)
 FEATURE(ptrauth_vtable_pointer_type_discrimination, LangOpts.PointerAuthVTPtrTypeDiscrimination)
 FEATURE(ptrauth_type_info_vtable_pointer_discrimination, LangOpts.PointerAuthTypeInfoVTPtrDiscrimination)
 FEATURE(ptrauth_member_function_pointer_type_discrimination, LangOpts.PointerAuthCalls)
+FEATURE(ptrauth_signed_block_descriptors, LangOpts.PointerAuthBlockDescriptorPointers)
 FEATURE(ptrauth_function_pointer_type_discrimination, LangOpts.PointerAuthFunctionTypeDiscrimination)
 FEATURE(ptrauth_indirect_gotos, LangOpts.PointerAuthIndirectGotos)
 FEATURE(ptrauth_init_fini, LangOpts.PointerAuthInitFini)
@@ -163,7 +166,7 @@ FEATURE(ptrauth_elf_got, LangOpts.PointerAuthELFGOT)
 
 FEATURE(ptrauth_objc_isa, LangOpts.PointerAuthObjcIsa)
 FEATURE(ptrauth_objc_interface_sel, LangOpts.PointerAuthObjcInterfaceSel)
-FEATURE(ptrauth_objc_signable_class, true)
+FEATURE(ptrauth_objc_signable_class, LangOpts.PointerAuthIntrinsics)
 FEATURE(ptrauth_objc_method_list_pointer, LangOpts.PointerAuthCalls)
 
 EXTENSION(swiftcc,
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 0d546cb3b847..25f4575a5425 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -138,6 +138,8 @@ LANGOPT(PointerAuthObjcInterfaceSel, 1, 0, NotCompatible, "authentication of SEL
 LANGOPT(PointerAuthObjcInterfaceSelKey, 16, 0, NotCompatible, "authentication key for SEL fields of ObjC interfaces")
 LANGOPT(PointerAuthObjcClassROPointers, 1, 0, Benign, "class_ro_t pointer authentication")
 
+LANGOPT(PointerAuthBlockDescriptorPointers, 1, 0, NotCompatible, "enable signed block descriptors")
+
 LANGOPT(DoubleSquareBracketAttributes, 1, 0, NotCompatible, "'[[]]' attributes extension for all language standard modes")
 LANGOPT(ExperimentalLateParseAttributes, 1, 0, NotCompatible, "experimental late parsing of attributes")
 
@@ -496,6 +498,8 @@ LANGOPT(CheckConstexprFunctionBodies, 1, 1, Benign,
 
 LANGOPT(BoundsSafety, 1, 0, NotCompatible, "Bounds safety extension for C")
 
+LANGOPT(EnableLifetimeSafety, 1, 0, NotCompatible, "Experimental lifetime safety analysis for C++")
+
 LANGOPT(PreserveVec3Type, 1, 0, NotCompatible, "Preserve 3-component vector type")
 
 #undef LANGOPT
diff --git a/clang/include/clang/Basic/PointerAuthOptions.h b/clang/include/clang/Basic/PointerAuthOptions.h
index fb6dddf3ae9c..2b920250721f 100644
--- a/clang/include/clang/Basic/PointerAuthOptions.h
+++ b/clang/include/clang/Basic/PointerAuthOptions.h
@@ -23,6 +23,10 @@
 
 namespace clang {
 
+/// Constant discriminator to be used with block descriptor pointers. The value
+/// is ptrauth_string_discriminator("block_descriptor")
+constexpr uint16_t BlockDescriptorConstantDiscriminator = 0xC0BB;
+
 /// Constant discriminator to be used with function pointers in .init_array and
 /// .fini_array. The value is ptrauth_string_discriminator("init_fini")
 constexpr uint16_t InitFiniPointerConstantDiscriminator = 0xD9D4;
@@ -223,6 +227,18 @@ struct PointerAuthOptions {
   /// The ABI for function addresses in .init_array and .fini_array
   PointerAuthSchema InitFiniPointers;
 
+  /// The ABI for block invocation function pointers.
+  PointerAuthSchema BlockInvocationFunctionPointers;
+
+  /// The ABI for block object copy/destroy function pointers.
+  PointerAuthSchema BlockHelperFunctionPointers;
+
+  /// The ABI for __block variable copy/destroy function pointers.
+  PointerAuthSchema BlockByrefHelperFunctionPointers;
+
+  /// The ABI for pointers to block descriptors.
+  PointerAuthSchema BlockDescriptorPointers;
+
   /// The ABI for Objective-C method lists.
   PointerAuthSchema ObjCMethodListFunctionPointers;
 
diff --git a/clang/include/clang/Driver/Multilib.h b/clang/include/clang/Driver/Multilib.h
index fc071ef48ca0..10c4457773f7 100644
--- a/clang/include/clang/Driver/Multilib.h
+++ b/clang/include/clang/Driver/Multilib.h
@@ -35,12 +35,18 @@ class Driver;
 class Multilib {
 public:
   using flags_list = std::vector<std::string>;
+  // Downstream issue: #446 (Extend the Multilib system to support an
+  // IncludeDirs field)
+  using includedirs_list = std::vector<std::string>;
 
 private:
   std::string GCCSuffix;
   std::string OSSuffix;
   std::string IncludeSuffix;
   flags_list Flags;
+  // Downstream issue: #446 (Extend the Multilib system to support an
+  // IncludeDirs field)
+  includedirs_list IncludeDirs;
 
   // Optionally, a multilib can be assigned a string tag indicating that it's
   // part of a group of mutually exclusive possibilities. If two or more
@@ -62,6 +68,9 @@ class Multilib {
   /// This is enforced with an assert in the constructor.
   Multilib(StringRef GCCSuffix = {}, StringRef OSSuffix = {},
            StringRef IncludeSuffix = {}, const flags_list &Flags = flags_list(),
+           // Downstream issue: #446 (Extend the Multilib system to support an
+           // IncludeDirs field)
+           const includedirs_list &IncludeDirs = includedirs_list(),
            StringRef ExclusiveGroup = {},
            std::optional<StringRef> Error = std::nullopt);
 
@@ -81,6 +90,12 @@ class Multilib {
   /// All elements begin with either '-' or '!'
   const flags_list &flags() const { return Flags; }
 
+  // Downstream issue: #446 (Extend the Multilib system to support an
+  // IncludeDirs field)
+  /// Get the include directories specified in multilib.yaml under the
+  /// 'IncludeDirs' field
+  const includedirs_list &includeDirs() const { return IncludeDirs; }
+
   /// Get the exclusive group label.
   const std::string &exclusiveGroup() const { return ExclusiveGroup; }
 
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index e7c7e9d93fe2..958d0d05ade2 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -990,6 +990,13 @@ def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
   Visibility<[ClangOption, CLOption, FlangOption]>,
   HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
   Group<Link_Group>;
+def Xthinlto_distributor_EQ : CommaJoined<["-"], "Xthinlto-distributor=">,
+  Flags<[LinkOption]>,
+  Visibility<[ClangOption, CLOption]>,
+  HelpText<"Pass <arg> to the ThinLTO distributor process. Can be specified "
+           "multiple times or with comma-separated values.">,
+  MetaVarName<"<arg>">,
+  Group<Link_Group>;
 def Xoffload_linker : JoinedAndSeparate<["-"], "Xoffload-linker">,
   Visibility<[ClangOption, FlangOption]>,
   HelpText<"Pass <arg> to the offload linkers or the ones identified by -<triple>">,
@@ -1910,6 +1917,14 @@ defm bounds_safety : BoolFOption<
   BothFlags<[], [CC1Option],
           " experimental bounds safety extension for C">>;
 
+defm lifetime_safety : BoolFOption<
+  "experimental-lifetime-safety",
+  LangOpts<"EnableLifetimeSafety">, DefaultFalse,
+  PosFlag<SetTrue, [], [CC1Option], "Enable">,
+  NegFlag<SetFalse, [], [CC1Option], "Disable">,
+  BothFlags<[], [CC1Option],
+          " experimental lifetime safety for C++">>;
+
 defm addrsig : BoolFOption<"addrsig",
   CodeGenOpts<"Addrsig">, DefaultFalse,
   PosFlag<SetTrue, [], [ClangOption, CC1Option], "Emit">,
@@ -4249,7 +4264,12 @@ def ffinite_loops: Flag<["-"],  "ffinite-loops">, Group<f_Group>,
 def fno_finite_loops: Flag<["-"], "fno-finite-loops">, Group<f_Group>,
   HelpText<"Do not assume that any loop is finite.">,
   Visibility<[ClangOption, CC1Option]>;
-
+def fthinlto_distributor_EQ : Joined<["-"], "fthinlto-distributor=">,
+  Group<f_Group>,
+  HelpText<"Path to the ThinLTO distributor process. If specified, "
+           "ThinLTO backend compilations will be distributed by LLD">,
+  MetaVarName<"<path>">,
+  Visibility<[ClangOption, CLOption]>;
 def ftrigraphs : Flag<["-"], "ftrigraphs">, Group<f_Group>,
   HelpText<"Process trigraph sequences">, Visibility<[ClangOption, CC1Option]>;
 def fno_trigraphs : Flag<["-"], "fno-trigraphs">, Group<f_Group>,
@@ -4507,6 +4527,7 @@ defm aarch64_jump_table_hardening: OptInCC1FFlag<"aarch64-jump-table-hardening",
 defm ptrauth_objc_isa : OptInCC1FFlag<"ptrauth-objc-isa", "Enable signing and authentication of Objective-C object's 'isa' field">;
 defm ptrauth_objc_interface_sel : OptInCC1FFlag<"ptrauth-objc-interface-sel", "Enable signing and authentication of Objective-C object's 'SEL' fields">;
 defm ptrauth_objc_class_ro : OptInCC1FFlag<"ptrauth-objc-class-ro", "Enable signing and authentication for ObjC class_ro pointers">;
+defm ptrauth_block_descriptor_pointers : OptInCC1FFlag<"ptrauth-block-descriptor-pointers", "Enable signing and authentication of block descriptors">;
 }
 
 def fenable_matrix : Flag<["-"], "fenable-matrix">, Group<f_Group>,
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index b4f2a87fe7e8..f0d0000c42a9 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -4694,6 +4694,13 @@ struct FormatStyle {
     ///      <conditional-body>                     <conditional-body>
     /// \endcode
     bool AfterIfMacros;
+    /// If ``true``, put a space between alternative operator ``not`` and the
+    /// opening parenthesis.
+    /// \code
+    ///    true:                                  false:
+    ///    return not (a || b);            vs.    return not(a || b);
+    /// \endcode
+    bool AfterNot;
     /// If ``true``, put a space between operator overloading and opening
     /// parentheses.
     /// \code
@@ -4742,9 +4749,9 @@ struct FormatStyle {
         : AfterControlStatements(false), AfterForeachMacros(false),
           AfterFunctionDeclarationName(false),
           AfterFunctionDefinitionName(false), AfterIfMacros(false),
-          AfterOverloadedOperator(false), AfterPlacementOperator(true),
-          AfterRequiresInClause(false), AfterRequiresInExpression(false),
-          BeforeNonEmptyParentheses(false) {}
+          AfterNot(false), AfterOverloadedOperator(false),
+          AfterPlacementOperator(true), AfterRequiresInClause(false),
+          AfterRequiresInExpression(false), BeforeNonEmptyParentheses(false) {}
 
     bool operator==(const SpaceBeforeParensCustom &Other) const {
       return AfterControlStatements == Other.AfterControlStatements &&
@@ -4753,6 +4760,7 @@ struct FormatStyle {
                  Other.AfterFunctionDeclarationName &&
              AfterFunctionDefinitionName == Other.AfterFunctionDefinitionName &&
              AfterIfMacros == Other.AfterIfMacros &&
+             AfterNot == Other.AfterNot &&
              AfterOverloadedOperator == Other.AfterOverloadedOperator &&
              AfterPlacementOperator == Other.AfterPlacementOperator &&
              AfterRequiresInClause == Other.AfterRequiresInClause &&
diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 06971ff87ab9..423f2ffe2f85 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -143,9 +143,6 @@ class Lexer : public PreprocessorLexer {
   /// True if this is the first time we're lexing the input file.
   bool IsFirstTimeLexingFile;
 
-  /// True if current lexing token is the first pp-token.
-  bool IsFirstPPToken;
-
   // NewLinePtr - A pointer to new line character '\n' being lexed. For '\r\n',
   // it also points to '\n.'
   const char *NewLinePtr;
diff --git a/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h
new file mode 100644
index 000000000000..9ab3c6a528a1
--- /dev/null
+++ b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h
@@ -0,0 +1,310 @@
+//===--- NoTrivialPPDirectiveTracer.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the NoTrivialPPDirectiveTracer interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
+#define LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
+
+#include "clang/Lex/PPCallbacks.h"
+
+namespace clang {
+class Preprocessor;
+
+/// Consider the following code:
+///
+/// # 1 __FILE__ 1 3
+/// export module a;
+///
+/// According to the wording in
+/// [P1857R3](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1857r3.html):
+///
+///   A module directive may only appear as the first preprocessing tokens in a
+///   file (excluding the global module fragment.)
+///
+/// and the wording in
+/// [[cpp.pre]](https://eel.is/c++draft/cpp.pre#nt:module-file):
+///   module-file:
+///     pp-global-module-fragment[opt] pp-module group[opt]
+///     pp-private-module-fragment[opt]
+///
+/// `#` is the first pp-token in the translation unit, and it was rejected by
+/// clang, but they really should be exempted from this rule. The goal is to not
+/// allow any preprocessor conditionals or most state changes, but these don't
+/// fit that.
+///
+/// State change would mean most semantically observable preprocessor state,
+/// particularly anything that is order dependent. Global flags like being a
+/// system header/module shouldn't matter.
+///
+/// We should exempt a brunch of directives, even though it violates the current
+/// standard wording.
+///
+/// This class used to trace 'no-trivial' pp-directives in main file, which may
+/// change the preprocessing state.
+///
+/// FIXME: Once the wording of the standard is revised, we need to follow the
+/// wording of the standard. Currently this is just a workaround
+class NoTrivialPPDirectiveTracer : public PPCallbacks {
+  Preprocessor &PP;
+
+  /// Whether preprocessing main file. We only focus on the main file.
+  bool InMainFile = true;
+
+  /// Whether one or more conditional, include or other 'no-trivial'
+  /// pp-directives has seen before.
+  bool SeenNoTrivialPPDirective = false;
+
+  void setSeenNoTrivialPPDirective();
+
+public:
+  NoTrivialPPDirectiveTracer(Preprocessor &P) : PP(P) {}
+
+  bool hasSeenNoTrivialPPDirective() const;
+
+  /// Callback invoked whenever the \p Lexer moves to a different file for
+  /// lexing. Unlike \p FileChanged line number directives and other related
+  /// pragmas do not trigger callbacks to \p LexedFileChanged.
+  ///
+  /// \param FID The \p FileID that the \p Lexer moved to.
+  ///
+  /// \param Reason Whether the \p Lexer entered a new file or exited one.
+  ///
+  /// \param FileType The \p CharacteristicKind of the file the \p Lexer moved
+  /// to.
+  ///
+  /// \param PrevFID The \p FileID the \p Lexer was using before the change.
+  ///
+  /// \param Loc The location where the \p Lexer entered a new file from or the
+  /// location that the \p Lexer moved into after exiting a file.
+  void LexedFileChanged(FileID FID, LexedFileChangeReason Reason,
+                        SrcMgr::CharacteristicKind FileType, FileID PrevFID,
+                        SourceLocation Loc) override;
+
+  /// Callback invoked whenever an embed directive has been processed,
+  /// regardless of whether the embed will actually find a file.
+  ///
+  /// \param HashLoc The location of the '#' that starts the embed directive.
+  ///
+  /// \param FileName The name of the file being included, as written in the
+  /// source code.
+  ///
+  /// \param IsAngled Whether the file name was enclosed in angle brackets;
+  /// otherwise, it was enclosed in quotes.
+  ///
+  /// \param File The actual file that may be included by this embed directive.
+  ///
+  /// \param Params The parameters used by the directive.
+  void EmbedDirective(SourceLocation HashLoc, StringRef FileName, bool IsAngled,
+                      OptionalFileEntryRef File,
+                      const LexEmbedParametersResult &Params) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked whenever an inclusion directive of
+  /// any kind (\c \#include, \c \#import, etc.) has been processed, regardless
+  /// of whether the inclusion will actually result in an inclusion.
+  ///
+  /// \param HashLoc The location of the '#' that starts the inclusion
+  /// directive.
+  ///
+  /// \param IncludeTok The token that indicates the kind of inclusion
+  /// directive, e.g., 'include' or 'import'.
+  ///
+  /// \param FileName The name of the file being included, as written in the
+  /// source code.
+  ///
+  /// \param IsAngled Whether the file name was enclosed in angle brackets;
+  /// otherwise, it was enclosed in quotes.
+  ///
+  /// \param FilenameRange The character range of the quotes or angle brackets
+  /// for the written file name.
+  ///
+  /// \param File The actual file that may be included by this inclusion
+  /// directive.
+  ///
+  /// \param SearchPath Contains the search path which was used to find the file
+  /// in the file system. If the file was found via an absolute include path,
+  /// SearchPath will be empty. For framework includes, the SearchPath and
+  /// RelativePath will be split up. For example, if an include of "Some/Some.h"
+  /// is found via the framework path
+  /// "path/to/Frameworks/Some.framework/Headers/Some.h", SearchPath will be
+  /// "path/to/Frameworks/Some.framework/Headers" and RelativePath will be
+  /// "Some.h".
+  ///
+  /// \param RelativePath The path relative to SearchPath, at which the include
+  /// file was found. This is equal to FileName except for framework includes.
+  ///
+  /// \param SuggestedModule The module suggested for this header, if any.
+  ///
+  /// \param ModuleImported Whether this include was translated into import of
+  /// \p SuggestedModule.
+  ///
+  /// \param FileType The characteristic kind, indicates whether a file or
+  /// directory holds normal user code, system code, or system code which is
+  /// implicitly 'extern "C"' in C++ mode.
+  ///
+  void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok,
+                          StringRef FileName, bool IsAngled,
+                          CharSourceRange FilenameRange,
+                          OptionalFileEntryRef File, StringRef SearchPath,
+                          StringRef RelativePath, const Module *SuggestedModule,
+                          bool ModuleImported,
+                          SrcMgr::CharacteristicKind FileType) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked whenever there was an explicit module-import
+  /// syntax.
+  ///
+  /// \param ImportLoc The location of import directive token.
+  ///
+  /// \param Path The identifiers (and their locations) of the module
+  /// "path", e.g., "std.vector" would be split into "std" and "vector".
+  ///
+  /// \param Imported The imported module; can be null if importing failed.
+  ///
+  void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path,
+                    const Module *Imported) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Callback invoked when the end of the main file is reached.
+  ///
+  /// No subsequent callbacks will be made.
+  void EndOfMainFile() override { setSeenNoTrivialPPDirective(); }
+
+  /// Callback invoked when start reading any pragma directive.
+  void PragmaDirective(SourceLocation Loc,
+                       PragmaIntroducerKind Introducer) override {}
+
+  /// Called by Preprocessor::HandleMacroExpandedIdentifier when a
+  /// macro invocation is found.
+  void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD,
+                    SourceRange Range, const MacroArgs *Args) override;
+
+  /// Hook called whenever a macro definition is seen.
+  void MacroDefined(const Token &MacroNameTok,
+                    const MacroDirective *MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever a macro \#undef is seen.
+  /// \param MacroNameTok The active Token
+  /// \param MD A MacroDefinition for the named macro.
+  /// \param Undef New MacroDirective if the macro was defined, null otherwise.
+  ///
+  /// MD is released immediately following this callback.
+  void MacroUndefined(const Token &MacroNameTok, const MacroDefinition &MD,
+                      const MacroDirective *Undef) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever the 'defined' operator is seen.
+  /// \param MD The MacroDirective if the name was a macro, null otherwise.
+  void Defined(const Token &MacroNameTok, const MacroDefinition &MD,
+               SourceRange Range) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#if is seen.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param ConditionValue The evaluated value of the condition.
+  ///
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void If(SourceLocation Loc, SourceRange ConditionRange,
+          ConditionValueKind ConditionValue) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elif is seen.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param ConditionValue The evaluated value of the condition.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elif(SourceLocation Loc, SourceRange ConditionRange,
+            ConditionValueKind ConditionValue, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#ifdef is seen.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Ifdef(SourceLocation Loc, const Token &MacroNameTok,
+             const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elifdef branch is taken.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Elifdef(SourceLocation Loc, const Token &MacroNameTok,
+               const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+  /// Hook called whenever an \#elifdef is skipped.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elifdef(SourceLocation Loc, SourceRange ConditionRange,
+               SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#ifndef is seen.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefiniton if the name was a macro, null otherwise.
+  void Ifndef(SourceLocation Loc, const Token &MacroNameTok,
+              const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#elifndef branch is taken.
+  /// \param Loc the source location of the directive.
+  /// \param MacroNameTok Information on the token being tested.
+  /// \param MD The MacroDefinition if the name was a macro, null otherwise.
+  void Elifndef(SourceLocation Loc, const Token &MacroNameTok,
+                const MacroDefinition &MD) override {
+    setSeenNoTrivialPPDirective();
+  }
+  /// Hook called whenever an \#elifndef is skipped.
+  /// \param Loc the source location of the directive.
+  /// \param ConditionRange The SourceRange of the expression being tested.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  // FIXME: better to pass in a list (or tree!) of Tokens.
+  void Elifndef(SourceLocation Loc, SourceRange ConditionRange,
+                SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#else is seen.
+  /// \param Loc the source location of the directive.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  void Else(SourceLocation Loc, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+
+  /// Hook called whenever an \#endif is seen.
+  /// \param Loc the source location of the directive.
+  /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive.
+  void Endif(SourceLocation Loc, SourceLocation IfLoc) override {
+    setSeenNoTrivialPPDirective();
+  }
+};
+
+} // namespace clang
+
+#endif // LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4d82e20e5d4f..e90564a9739a 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -82,6 +82,7 @@ class PreprocessorLexer;
 class PreprocessorOptions;
 class ScratchBuffer;
 class TargetInfo;
+class NoTrivialPPDirectiveTracer;
 
 namespace Builtin {
 class Context;
@@ -353,6 +354,11 @@ class Preprocessor {
   /// First pp-token source location in current translation unit.
   SourceLocation FirstPPTokenLoc;
 
+  /// A preprocessor directive tracer to trace whether the preprocessing
+  /// state changed. These changes would mean most semantically observable
+  /// preprocessor state, particularly anything that is order dependent.
+  NoTrivialPPDirectiveTracer *DirTracer = nullptr;
+
   /// A position within a C++20 import-seq.
   class StdCXXImportSeq {
   public:
@@ -609,6 +615,8 @@ class Preprocessor {
       return State == NamedModuleImplementation && !getName().contains(':');
     }
 
+    bool isNotAModuleDecl() const { return State == NotAModuleDecl; }
+
     StringRef getName() const {
       assert(isNamedModule() && "Can't get name from a non named module");
       return Name;
@@ -3087,6 +3095,10 @@ class Preprocessor {
   bool setDeserializedSafeBufferOptOutMap(
       const SmallVectorImpl<SourceLocation> &SrcLocSeqs);
 
+  /// Whether we've seen pp-directives which may have changed the preprocessing
+  /// state.
+  bool hasSeenNoTrivialPPDirective() const;
+
 private:
   /// Helper functions to forward lexing to the actual lexer. They all share the
   /// same signature.
diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h
index fc43e72593b9..d9dc5a562d80 100644
--- a/clang/include/clang/Lex/Token.h
+++ b/clang/include/clang/Lex/Token.h
@@ -86,12 +86,12 @@ class Token {
                                 // macro stringizing or charizing operator.
     CommaAfterElided = 0x200, // The comma following this token was elided (MS).
     IsEditorPlaceholder = 0x400, // This identifier is a placeholder.
-
-    IsReinjected = 0x800,  // A phase 4 token that was produced before and
-                           // re-added, e.g. via EnterTokenStream. Annotation
-                           // tokens are *not* reinjected.
-    FirstPPToken = 0x1000, // This token is the first pp token in the
-                           // translation unit.
+    IsReinjected = 0x800,        // A phase 4 token that was produced before and
+                          // re-added, e.g. via EnterTokenStream. Annotation
+                          // tokens are *not* reinjected.
+    HasSeenNoTrivialPPDirective =
+        0x1000, // Whether we've seen any 'no-trivial' pp-directives before
+                // current position.
   };
 
   tok::TokenKind getKind() const { return Kind; }
@@ -321,8 +321,9 @@ class Token {
   /// lexer uses identifier tokens to represent placeholders.
   bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); }
 
-  /// Returns true if this token is the first pp-token.
-  bool isFirstPPToken() const { return getFlag(FirstPPToken); }
+  bool hasSeenNoTrivialPPDirective() const {
+    return getFlag(HasSeenNoTrivialPPDirective);
+  }
 };
 
 /// Information about the conditional stack (\#if directives)
diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h
index 9135ff949eea..a70335bef9dd 100644
--- a/clang/include/clang/Sema/Overload.h
+++ b/clang/include/clang/Sema/Overload.h
@@ -350,11 +350,6 @@ class Sema;
     LLVM_PREFERRED_TYPE(bool)
     unsigned BindsToRvalue : 1;
 
-    /// Whether this was an identity conversion with qualification
-    /// conversion for the implicit object argument.
-    LLVM_PREFERRED_TYPE(bool)
-    unsigned IsImplicitObjectArgumentQualificationConversion : 1;
-
     /// Whether this binds an implicit object argument to a
     /// non-static member function without a ref-qualifier.
     LLVM_PREFERRED_TYPE(bool)
@@ -453,11 +448,11 @@ class Sema;
 #endif
         return true;
       }
+      if (!C.hasSameType(getFromType(), getToType(2)))
+        return false;
       if (BindsToRvalue && IsLvalueReference)
         return false;
-      if (IsImplicitObjectArgumentQualificationConversion)
-        return C.hasSameUnqualifiedType(getFromType(), getToType(2));
-      return C.hasSameType(getFromType(), getToType(2));
+      return true;
     }
 
     ImplicitConversionRank getRank() const;
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index b331acbe606b..7b0be368d67f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -9836,7 +9836,7 @@ class Sema final : public SemaBase {
                                  SourceLocation ModuleLoc, ModuleDeclKind MDK,
                                  ModuleIdPath Path, ModuleIdPath Partition,
                                  ModuleImportState &ImportState,
-                                 bool IntroducerIsFirstPPToken);
+                                 bool SeenNoTrivialPPDirective);
 
   /// The parser has processed a global-module-fragment declaration that begins
   /// the definition of the global module fragment of the current module unit.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 0499a81cd523..92d1b536e474 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -3032,7 +3032,7 @@ bool ASTContext::hasUniqueObjectRepresentations(
     return true;
   }
 
-  // All other pointers (except __ptrauth pointers) are unique.
+  // All other pointers are unique.
   if (Ty->isPointerType())
     return !Ty.hasAddressDiscriminatedPointerAuth();
 
diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp
index 3aa6b3784410..25d19a3dcd48 100644
--- a/clang/lib/AST/ASTStructuralEquivalence.cpp
+++ b/clang/lib/AST/ASTStructuralEquivalence.cpp
@@ -456,7 +456,9 @@ CheckStructurallyEquivalentAttributes(StructuralEquivalenceContext &Context,
                                       const Decl *D1, const Decl *D2,
                                       const Decl *PrimaryDecl = nullptr) {
   // If either declaration has an attribute on it, we treat the declarations
-  // as not being structurally equivalent.
+  // as not being structurally equivalent unless both declarations are implicit
+  // (ones generated by the compiler like __NSConstantString_tag).
+  //
   // FIXME: this should be handled on a case-by-case basis via tablegen in
   // Attr.td. There are multiple cases to consider: one declaration with the
   // attribute, another without it; different attribute syntax|spellings for
@@ -468,7 +470,7 @@ CheckStructurallyEquivalentAttributes(StructuralEquivalenceContext &Context,
     D1Attr = *D1->getAttrs().begin();
   if (D2->hasAttrs())
     D2Attr = *D2->getAttrs().begin();
-  if (D1Attr || D2Attr) {
+  if ((D1Attr || D2Attr) && !D1->isImplicit() && !D2->isImplicit()) {
     const auto *DiagnoseDecl = cast<TypeDecl>(PrimaryDecl ? PrimaryDecl : D2);
     Context.Diag2(DiagnoseDecl->getLocation(),
                   diag::warn_odr_tag_type_with_attributes)
@@ -873,7 +875,29 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
     else if (T1->getTypeClass() == Type::FunctionNoProto &&
              T2->getTypeClass() == Type::FunctionProto)
       TC = Type::FunctionNoProto;
-    else
+    else if (Context.LangOpts.C23 && !Context.StrictTypeSpelling &&
+             (T1->getTypeClass() == Type::Enum ||
+              T2->getTypeClass() == Type::Enum)) {
+      // In C23, if not being strict about token equivalence, we need to handle
+      // the case where one type is an enumeration and the other type is an
+      // integral type.
+      //
+      // C23 6.7.3.3p16: The enumerated type is compatible with the underlying
+      // type of the enumeration.
+      //
+      // Treat the enumeration as its underlying type and use the builtin type
+      // class comparison.
+      if (T1->getTypeClass() == Type::Enum) {
+        T1 = T1->getAs<EnumType>()->getDecl()->getIntegerType();
+        if (!T2->isBuiltinType() || T1.isNull()) // Sanity check
+          return false;
+      } else if (T2->getTypeClass() == Type::Enum) {
+        T2 = T2->getAs<EnumType>()->getDecl()->getIntegerType();
+        if (!T1->isBuiltinType() || T2.isNull()) // Sanity check
+          return false;
+      }
+      TC = Type::Builtin;
+    } else
       return false;
   }
 
@@ -2067,6 +2091,48 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context,
       !CheckStructurallyEquivalentAttributes(Context, D1, D2))
     return false;
 
+  // In C23, if one enumeration has a fixed underlying type, the other shall
+  // have a compatible fixed underlying type (6.2.7).
+  if (Context.LangOpts.C23) {
+    if (D1->isFixed() != D2->isFixed()) {
+      if (Context.Complain) {
+        Context.Diag2(D2->getLocation(),
+                      Context.getApplicableDiagnostic(
+                          diag::err_odr_tag_type_inconsistent))
+            << Context.ToCtx.getTypeDeclType(D2)
+            << (&Context.FromCtx != &Context.ToCtx);
+        Context.Diag1(D1->getLocation(),
+                      D1->isFixed()
+                          ? diag::note_odr_fixed_underlying_type
+                          : diag::note_odr_missing_fixed_underlying_type)
+            << D1;
+        Context.Diag2(D2->getLocation(),
+                      D2->isFixed()
+                          ? diag::note_odr_fixed_underlying_type
+                          : diag::note_odr_missing_fixed_underlying_type)
+            << D2;
+      }
+      return false;
+    }
+    if (D1->isFixed()) {
+      assert(D2->isFixed() && "enums expected to have fixed underlying types");
+      if (!IsStructurallyEquivalent(Context, D1->getIntegerType(),
+                                    D2->getIntegerType())) {
+        if (Context.Complain) {
+          Context.Diag2(D2->getLocation(),
+                        Context.getApplicableDiagnostic(
+                            diag::err_odr_tag_type_inconsistent))
+              << Context.ToCtx.getTypeDeclType(D2)
+              << (&Context.FromCtx != &Context.ToCtx);
+          Context.Diag2(D2->getLocation(),
+                        diag::note_odr_incompatible_fixed_underlying_type)
+              << D2 << D2->getIntegerType() << D1->getIntegerType();
+        }
+        return false;
+      }
+    }
+  }
+
   llvm::SmallVector<const EnumConstantDecl *, 8> D1Enums, D2Enums;
   auto CopyEnumerators =
       [](auto &&Range, llvm::SmallVectorImpl<const EnumConstantDecl *> &Cont) {
diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp
index 451496500979..64895f4fb9d4 100644
--- a/clang/lib/AST/DeclCXX.cpp
+++ b/clang/lib/AST/DeclCXX.cpp
@@ -1448,6 +1448,13 @@ void CXXRecordDecl::addedMember(Decl *D) {
         data().StructuralIfLiteral = false;
     }
 
+    // If this type contains any address discriminated values we should
+    // have already indicated that the only special member functions that
+    // can possibly be trivial are the default constructor and destructor.
+    if (T.hasAddressDiscriminatedPointerAuth())
+      data().HasTrivialSpecialMembers &=
+          SMF_DefaultConstructor | SMF_Destructor;
+
     // C++14 [meta.unary.prop]p4:
     //   T is a class type [...] with [...] no non-static data members other
     //   than subobjects of zero size
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 419dd5dbdc69..bfecae972940 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -4441,7 +4441,8 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
         }
       } else if (!IsAccess) {
         return CompleteObject(LVal.getLValueBase(), nullptr, BaseType);
-      } else if (IsConstant && Info.checkingPotentialConstantExpression() &&
+      } else if ((IsConstant || BaseType->isReferenceType()) &&
+                 Info.checkingPotentialConstantExpression() &&
                  BaseType->isLiteralType(Info.Ctx) && !VD->hasDefinition()) {
         // This variable might end up being constexpr. Don't diagnose it yet.
       } else if (IsConstant) {
@@ -4478,9 +4479,11 @@ static CompleteObject findCompleteObject(EvalInfo &Info, const Expr *E,
     // a null BaseVal. Any constexpr-unknown variable seen here is an error:
     // we can't access a constexpr-unknown object.
     if (!BaseVal) {
-      Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1)
-          << AK << VD;
-      Info.Note(VD->getLocation(), diag::note_declared_at);
+      if (!Info.checkingPotentialConstantExpression()) {
+        Info.FFDiag(E, diag::note_constexpr_access_unknown_variable, 1)
+            << AK << VD;
+        Info.Note(VD->getLocation(), diag::note_declared_at);
+      }
       return CompleteObject();
     }
   } else if (DynamicAllocLValue DA = LVal.Base.dyn_cast<DynamicAllocLValue>()) {
@@ -7906,8 +7909,9 @@ static bool checkBitCastConstexprEligibilityType(SourceLocation Loc,
       // so its layout is unspecified. For now, we'll simply treat these cases
       // as unsupported (this should only be possible with OpenCL bool vectors
       // whose element count isn't a multiple of the byte size).
-      Info->FFDiag(Loc, diag::note_constexpr_bit_cast_invalid_vector)
-          << QualType(VTy, 0) << EltSize << NElts << Ctx.getCharWidth();
+      if (Info)
+        Info->FFDiag(Loc, diag::note_constexpr_bit_cast_invalid_vector)
+            << QualType(VTy, 0) << EltSize << NElts << Ctx.getCharWidth();
       return false;
     }
 
@@ -7916,8 +7920,9 @@ static bool checkBitCastConstexprEligibilityType(SourceLocation Loc,
       // The layout for x86_fp80 vectors seems to be handled very inconsistently
       // by both clang and LLVM, so for now we won't allow bit_casts involving
       // it in a constexpr context.
-      Info->FFDiag(Loc, diag::note_constexpr_bit_cast_unsupported_type)
-          << EltTy;
+      if (Info)
+        Info->FFDiag(Loc, diag::note_constexpr_bit_cast_unsupported_type)
+            << EltTy;
       return false;
     }
   }
@@ -10929,10 +10934,6 @@ bool RecordExprEvaluator::VisitCXXConstructExpr(const CXXConstructExpr *E,
 
   bool ZeroInit = E->requiresZeroInitialization();
   if (CheckTrivialDefaultConstructor(Info, E->getExprLoc(), FD, ZeroInit)) {
-    // If we've already performed zero-initialization, we're already done.
-    if (Result.hasValue())
-      return true;
-
     if (ZeroInit)
       return ZeroInitialization(E, T);
 
@@ -14478,12 +14479,6 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     if (!EvaluatePointer(E->getRHS(), RHSValue, Info) || !LHSOK)
       return false;
 
-    // If we have Unknown pointers we should fail if they are not global values.
-    if (!(IsGlobalLValue(LHSValue.getLValueBase()) &&
-          IsGlobalLValue(RHSValue.getLValueBase())) &&
-        (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown))
-      return false;
-
     // Reject differing bases from the normal codepath; we special-case
     // comparisons to null.
     if (!HasSameBase(LHSValue, RHSValue)) {
@@ -14545,6 +14540,10 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
           (LHSValue.Base && isZeroSized(RHSValue)))
         return DiagComparison(
             diag::note_constexpr_pointer_comparison_zero_sized);
+      if (LHSValue.AllowConstexprUnknown || RHSValue.AllowConstexprUnknown)
+        return DiagComparison(
+            diag::note_constexpr_pointer_comparison_unspecified);
+      // FIXME: Verify both variables are live.
       return Success(CmpResult::Unequal, E);
     }
 
@@ -14565,7 +14564,9 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     if (!LHSDesignator.Invalid && !RHSDesignator.Invalid && IsRelational) {
       bool WasArrayIndex;
       unsigned Mismatch = FindDesignatorMismatch(
-          getType(LHSValue.Base), LHSDesignator, RHSDesignator, WasArrayIndex);
+          LHSValue.Base.isNull() ? QualType()
+                                 : getType(LHSValue.Base).getNonReferenceType(),
+          LHSDesignator, RHSDesignator, WasArrayIndex);
       // At the point where the designators diverge, the comparison has a
       // specified value if:
       //  - we are comparing array indices
@@ -14609,7 +14610,7 @@ EvaluateComparisonBinaryOperator(EvalInfo &Info, const BinaryOperator *E,
     // compare pointers within the object in question; otherwise, the result
     // depends on where the object is located in memory.
     if (!LHSValue.Base.isNull() && IsRelational) {
-      QualType BaseTy = getType(LHSValue.Base);
+      QualType BaseTy = getType(LHSValue.Base).getNonReferenceType();
       if (BaseTy->isIncompleteType())
         return Error(E);
       CharUnits Size = Info.Ctx.getTypeSizeInChars(BaseTy);
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index e5a1ab2ff890..bd3dd983bca7 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2715,6 +2715,11 @@ bool QualType::isCXX98PODType(const ASTContext &Context) const {
     return false;
 
   QualType CanonicalType = getTypePtr()->CanonicalType;
+
+  // Any type that is, or contains, address discriminated data is never POD.
+  if (const_cast<ASTContext&>(Context).containsAddressDiscriminatedPointerAuth(CanonicalType))
+    return false;
+
   switch (CanonicalType->getTypeClass()) {
     // Everything not explicitly mentioned is not POD.
   default:
@@ -2773,6 +2778,11 @@ bool QualType::isTrivialType(const ASTContext &Context) const {
   if (CanonicalType->isDependentType())
     return false;
 
+  // Any type that is, or contains, address discriminated data is never a
+  // trivial type.
+  if (const_cast<ASTContext&>(Context).containsAddressDiscriminatedPointerAuth(CanonicalType))
+    return false;
+
   // C++0x [basic.types]p9:
   //   Scalar types, trivial class types, arrays of such types, and
   //   cv-qualified versions of these types are collectively called trivial
@@ -2870,6 +2880,12 @@ bool QualType::isBitwiseCloneableType(const ASTContext &Context) const {
 
   if (CanonicalType->isIncompleteType())
     return false;
+
+  // Any type that is, or contains, address discriminated data is never
+  // bitwise clonable.
+  if (const_cast<ASTContext&>(Context).containsAddressDiscriminatedPointerAuth(CanonicalType))
+    return false;
+
   const auto *RD = CanonicalType->getAsRecordDecl(); // struct/union/class
   if (!RD)
     return true;
@@ -3115,6 +3131,10 @@ bool QualType::isCXX11PODType(const ASTContext &Context) const {
   if (BaseTy->isIncompleteType())
     return false;
 
+  // Any type that is, or contains, address discriminated data is non-POD.
+  if (const_cast<ASTContext&>(Context).containsAddressDiscriminatedPointerAuth(*this))
+    return false;
+
   // As an extension, Clang treats vector types as Scalar types.
   if (BaseTy->isScalarType() || BaseTy->isVectorType())
     return true;
diff --git a/clang/lib/Analysis/ThreadSafety.cpp b/clang/lib/Analysis/ThreadSafety.cpp
index 80e7c8eff671..dadb0b757a2c 100644
--- a/clang/lib/Analysis/ThreadSafety.cpp
+++ b/clang/lib/Analysis/ThreadSafety.cpp
@@ -1331,7 +1331,7 @@ void ThreadSafetyAnalyzer::addLock(FactSet &FSet,
       FSet.removeLock(FactMan, NegC);
     }
     else {
-      if (inCurrentScope(*Entry) && !Entry->asserted())
+      if (inCurrentScope(*Entry) && !Entry->asserted() && !Entry->reentrant())
         Handler.handleNegativeNotHeld(Entry->getKind(), Entry->toString(),
                                       NegC.toString(), Entry->loc());
     }
diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp
index af1111a86330..5d11578893c6 100644
--- a/clang/lib/Basic/Targets.cpp
+++ b/clang/lib/Basic/Targets.cpp
@@ -769,6 +769,9 @@ std::unique_ptr<TargetInfo> AllocateTarget(const llvm::Triple &Triple,
     case llvm::Triple::FreeBSD:
       return std::make_unique<FreeBSDTargetInfo<LoongArch64TargetInfo>>(Triple,
                                                                         Opts);
+    case llvm::Triple::OpenBSD:
+      return std::make_unique<OpenBSDTargetInfo<LoongArch64TargetInfo>>(Triple,
+                                                                        Opts);
     default:
       return std::make_unique<LoongArch64TargetInfo>(Triple, Opts);
     }
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index f6915df1520b..8e29bb745734 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -461,6 +461,8 @@ LoongArchTargetInfo::parseTargetAttr(StringRef Features) const {
 
     case AttrFeatureKind::Feature:
       Ret.Features.push_back("+" + Value.str());
+      if (Value == "lasx")
+        Ret.Features.push_back("+lsx");
       break;
     }
   }
diff --git a/clang/lib/Basic/Targets/Mips.h b/clang/lib/Basic/Targets/Mips.h
index 35501ed44ccd..e199df32f56e 100644
--- a/clang/lib/Basic/Targets/Mips.h
+++ b/clang/lib/Basic/Targets/Mips.h
@@ -129,7 +129,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     LongWidth = LongAlign = 32;
     MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
     PointerWidth = PointerAlign = 32;
-    PtrDiffType = SignedInt;
+    PtrDiffType = IntPtrType = SignedInt;
     SizeType = UnsignedInt;
     SuitableAlign = 64;
   }
@@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     IntMaxType = Int64Type;
     LongWidth = LongAlign = 64;
     PointerWidth = PointerAlign = 64;
-    PtrDiffType = SignedLong;
+    PtrDiffType = IntPtrType = SignedLong;
     SizeType = UnsignedLong;
   }
 
@@ -165,7 +165,7 @@ class LLVM_LIBRARY_VISIBILITY MipsTargetInfo : public TargetInfo {
     IntMaxType = Int64Type;
     LongWidth = LongAlign = 32;
     PointerWidth = PointerAlign = 32;
-    PtrDiffType = SignedInt;
+    PtrDiffType = IntPtrType = SignedInt;
     SizeType = UnsignedInt;
   }
 
diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h
index 30d861a7ca60..c1a68f464e83 100644
--- a/clang/lib/Basic/Targets/OSTargets.h
+++ b/clang/lib/Basic/Targets/OSTargets.h
@@ -174,6 +174,9 @@ class LLVM_LIBRARY_VISIBILITY DragonFlyBSDTargetInfo
     DefineStd(Builder, "unix", Opts);
     if (this->HasFloat128)
       Builder.defineMacro("__FLOAT128__");
+
+    if (Opts.C11)
+      Builder.defineMacro("__STDC_NO_THREADS__");
   }
 
 public:
@@ -496,6 +499,7 @@ class LLVM_LIBRARY_VISIBILITY OpenBSDTargetInfo : public OSTargetInfo<Target> {
     case llvm::Triple::sparcv9:
       this->MCountName = "_mcount";
       break;
+    case llvm::Triple::loongarch64:
     case llvm::Triple::riscv64:
       break;
     }
diff --git a/clang/lib/CodeGen/Address.h b/clang/lib/CodeGen/Address.h
index a748ddaa110a..4e7f3561ac04 100644
--- a/clang/lib/CodeGen/Address.h
+++ b/clang/lib/CodeGen/Address.h
@@ -176,6 +176,11 @@ class Address {
   static Address invalid() { return Address(nullptr); }
   bool isValid() const { return Pointer.getPointer() != nullptr; }
 
+  llvm::Value *getPointerIfNotSigned() const {
+    assert(isValid() && "pointer isn't valid");
+    return !isSigned() ? Pointer.getPointer() : nullptr;
+  }
+
   /// This function is used in situations where the caller is doing some sort of
   /// opaque "laundering" of the pointer.
   void replaceBasePointer(llvm::Value *P) {
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index 1aba841eb5fc..c1f5b983f723 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -188,13 +188,14 @@ static llvm::Constant *buildBlockDescriptor(CodeGenModule &CGM,
   // Optional copy/dispose helpers.
   bool hasInternalHelper = false;
   if (blockInfo.NeedsCopyDispose) {
+    auto &Schema = CGM.getCodeGenOpts().PointerAuth.BlockHelperFunctionPointers;
     // copy_func_helper_decl
     llvm::Constant *copyHelper = buildCopyHelper(CGM, blockInfo);
-    elements.add(copyHelper);
+    elements.addSignedPointer(copyHelper, Schema, GlobalDecl(), QualType());
 
     // destroy_func_decl
     llvm::Constant *disposeHelper = buildDisposeHelper(CGM, blockInfo);
-    elements.add(disposeHelper);
+    elements.addSignedPointer(disposeHelper, Schema, GlobalDecl(), QualType());
 
     if (cast<llvm::Function>(copyHelper->stripPointerCasts())
             ->hasInternalLinkage() ||
@@ -567,9 +568,8 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF,
       llvm::StructType::get(CGM.getLLVMContext(), elementTypes, true);
     info.CanBeGlobal = true;
     return;
-  }
-  else if (C.getLangOpts().ObjC &&
-           CGM.getLangOpts().getGC() == LangOptions::NonGC)
+  } else if (C.getLangOpts().ObjC &&
+             CGM.getLangOpts().getGC() == LangOptions::NonGC)
     info.HasCapturedVariableLayout = true;
 
   if (block->doesNotEscape())
@@ -783,7 +783,7 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const BlockExpr *blockExpr) {
 
 llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
   bool IsOpenCL = CGM.getContext().getLangOpts().OpenCL;
-  auto GenVoidPtrTy =
+  llvm::PointerType *GenVoidPtrTy =
       IsOpenCL ? CGM.getOpenCLRuntime().getGenericVoidPointerType() : VoidPtrTy;
   LangAS GenVoidPtrAddr = IsOpenCL ? LangAS::opencl_generic : LangAS::Default;
   auto GenVoidPtrSize = CharUnits::fromQuantity(
@@ -817,9 +817,6 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                                    : CGM.getNSConcreteStackBlock();
     isa = blockISA;
 
-    // Build the block descriptor.
-    descriptor = buildBlockDescriptor(CGM, blockInfo);
-
     // Compute the initial on-stack block flags.
     if (!CGM.getCodeGenOpts().DisableBlockSignatureString)
       flags = BLOCK_HAS_SIGNATURE;
@@ -833,6 +830,9 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
       flags |= BLOCK_USE_STRET;
     if (blockInfo.NoEscape)
       flags |= BLOCK_IS_NOESCAPE | BLOCK_IS_GLOBAL;
+
+    // Build the block descriptor.
+    descriptor = buildBlockDescriptor(CGM, blockInfo);
   }
 
   auto projectField = [&](unsigned index, const Twine &name) -> Address {
@@ -883,11 +883,25 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
           llvm::ConstantInt::get(IntTy, blockInfo.BlockAlign.getQuantity()),
           getIntSize(), "block.align");
     }
-    addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
-    if (!IsOpenCL)
-      addHeaderField(descriptor, getPointerSize(), "block.descriptor");
-    else if (auto *Helper =
-                 CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+
+    if (!IsOpenCL) {
+      llvm::Value *blockFnPtr =
+          llvm::ConstantExpr::getBitCast(InvokeFn, VoidPtrTy);
+      QualType type = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+      addSignedHeaderField(
+          blockFnPtr,
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers,
+          GlobalDecl(), type, getPointerSize(), "block.invoke");
+
+      addSignedHeaderField(
+          descriptor, CGM.getCodeGenOpts().PointerAuth.BlockDescriptorPointers,
+          GlobalDecl(), type, getPointerSize(), "block.descriptor");
+    } else if (auto *Helper =
+                   CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
       for (auto I : Helper->getCustomFieldValues(*this, blockInfo)) {
         addHeaderField(
             I.first,
@@ -895,7 +909,8 @@ llvm::Value *CodeGenFunction::EmitBlockLiteral(const CGBlockInfo &blockInfo) {
                 CGM.getDataLayout().getTypeAllocSize(I.first->getType())),
             I.second);
       }
-    }
+    } else
+      addHeaderField(blockFn, GenVoidPtrSize, "block.invoke");
   }
 
   // Finally, capture all the values into the block.
@@ -1166,6 +1181,8 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
   ASTContext &Ctx = getContext();
   CallArgList Args;
 
+  llvm::Value *FuncPtr = nullptr;
+
   if (getLangOpts().OpenCL) {
     // For OpenCL, BlockPtr is already casted to generic block literal.
 
@@ -1185,7 +1202,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     if (!isa<ParmVarDecl>(E->getCalleeDecl()))
       Func = CGM.getOpenCLRuntime().getInvokeFunction(E->getCallee());
     else {
-      llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
+      FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 2);
       Func = Builder.CreateAlignedLoad(GenericVoidPtrTy, FuncPtr,
                                        getPointerAlign());
     }
@@ -1194,7 +1211,7 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     BlockPtr =
         Builder.CreatePointerCast(BlockPtr, UnqualPtrTy, "block.literal");
     // Get pointer to the block invoke function
-    llvm::Value *FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
+    FuncPtr = Builder.CreateStructGEP(GenBlockTy, BlockPtr, 3);
 
     // First argument is a block literal casted to a void pointer
     BlockPtr = Builder.CreatePointerCast(BlockPtr, VoidPtrTy);
@@ -1211,7 +1228,15 @@ RValue CodeGenFunction::EmitBlockCallExpr(const CallExpr *E,
     CGM.getTypes().arrangeBlockFunctionCall(Args, FuncTy);
 
   // Prepare the callee.
-  CGCallee Callee(CGCalleeInfo(), Func);
+  CGPointerAuthInfo PointerAuth;
+  if (auto &AuthSchema =
+          CGM.getCodeGenOpts().PointerAuth.BlockInvocationFunctionPointers) {
+    assert(FuncPtr != nullptr && "Missing function pointer for AuthInfo");
+    PointerAuth =
+        EmitPointerAuthInfo(AuthSchema, FuncPtr, GlobalDecl(), FnType);
+  }
+
+  CGCallee Callee(CGCalleeInfo(), Func, PointerAuth);
 
   // And call the block.
   return EmitCall(FnInfo, Callee, ReturnValue, Args, CallOrInvoke);
@@ -1295,14 +1320,15 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
 
   bool IsOpenCL = CGM.getLangOpts().OpenCL;
   bool IsWindows = CGM.getTarget().getTriple().isOSWindows();
+  auto &CGOPointerAuth = CGM.getCodeGenOpts().PointerAuth;
   if (!IsOpenCL) {
     // isa
     if (IsWindows)
       fields.addNullPointer(CGM.Int8PtrPtrTy);
     else
       fields.addSignedPointer(CGM.getNSConcreteGlobalBlock(),
-                              CGM.getCodeGenOpts().PointerAuth.ObjCIsaPointers,
-                              GlobalDecl(), QualType());
+                              CGOPointerAuth.ObjCIsaPointers, GlobalDecl(),
+                              QualType());
 
     // __flags
     BlockFlags flags = BLOCK_IS_GLOBAL;
@@ -1321,11 +1347,20 @@ static llvm::Constant *buildGlobalBlock(CodeGenModule &CGM,
   }
 
   // Function
-  fields.add(blockFn);
+  if (auto &Schema = CGOPointerAuth.BlockInvocationFunctionPointers) {
+    QualType FnType = blockInfo.getBlockExpr()
+                          ->getType()
+                          ->castAs<BlockPointerType>()
+                          ->getPointeeType();
+    fields.addSignedPointer(blockFn, Schema, GlobalDecl(), FnType);
+  } else
+    fields.add(blockFn);
 
   if (!IsOpenCL) {
     // Descriptor
-    fields.add(buildBlockDescriptor(CGM, blockInfo));
+    llvm::Constant *Descriptor = buildBlockDescriptor(CGM, blockInfo);
+    fields.addSignedPointer(Descriptor, CGOPointerAuth.BlockDescriptorPointers,
+                            GlobalDecl(), QualType());
   } else if (auto *Helper =
                  CGM.getTargetCodeGenInfo().getTargetOpenCLBlockHelper()) {
     for (auto *I : Helper->getCustomFieldValues(CGM, blockInfo)) {
@@ -1995,8 +2030,8 @@ CodeGenFunction::GenerateCopyHelperFunction(const CGBlockInfo &blockInfo) {
         // it. It's not quite worth the annoyance to avoid creating it in the
         // first place.
         if (!needsEHCleanup(captureType.isDestructedType()))
-          if (auto *I =
-                  cast_or_null<llvm::Instruction>(dstField.getBasePointer()))
+          if (auto *I = cast_or_null<llvm::Instruction>(
+                  dstField.getPointerIfNotSigned()))
             I->eraseFromParent();
       }
       break;
@@ -2730,8 +2765,16 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   unsigned nextHeaderIndex = 0;
   CharUnits nextHeaderOffset;
   auto storeHeaderField = [&](llvm::Value *value, CharUnits fieldSize,
-                              const Twine &name) {
+                              const Twine &name, bool isFunction = false) {
     auto fieldAddr = Builder.CreateStructGEP(addr, nextHeaderIndex, name);
+    if (isFunction) {
+      if (auto &Schema = CGM.getCodeGenOpts()
+                             .PointerAuth.BlockByrefHelperFunctionPointers) {
+        auto PointerAuth = EmitPointerAuthInfo(
+            Schema, fieldAddr.emitRawPointer(*this), GlobalDecl(), QualType());
+        value = EmitPointerAuthSign(PointerAuth, value);
+      }
+    }
     Builder.CreateStore(value, fieldAddr);
 
     nextHeaderIndex++;
@@ -2814,10 +2857,10 @@ void CodeGenFunction::emitByrefStructureInit(const AutoVarEmission &emission) {
   storeHeaderField(V, getIntSize(), "byref.size");
 
   if (helpers) {
-    storeHeaderField(helpers->CopyHelper, getPointerSize(),
-                     "byref.copyHelper");
+    storeHeaderField(helpers->CopyHelper, getPointerSize(), "byref.copyHelper",
+                     /*isFunction=*/true);
     storeHeaderField(helpers->DisposeHelper, getPointerSize(),
-                     "byref.disposeHelper");
+                     "byref.disposeHelper", /*isFunction=*/true);
   }
 
   if (ByRefHasLifetime && HasByrefExtendedLayout) {
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index c8c3d6b20c49..5344a9710d64 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -4787,19 +4787,6 @@ struct DestroyUnpassedArg final : EHScopeStack::Cleanup {
   }
 };
 
-struct DisableDebugLocationUpdates {
-  CodeGenFunction &CGF;
-  bool disabledDebugInfo;
-  DisableDebugLocationUpdates(CodeGenFunction &CGF, const Expr *E) : CGF(CGF) {
-    if ((disabledDebugInfo = isa<CXXDefaultArgExpr>(E) && CGF.getDebugInfo()))
-      CGF.disableDebugInfo();
-  }
-  ~DisableDebugLocationUpdates() {
-    if (disabledDebugInfo)
-      CGF.enableDebugInfo();
-  }
-};
-
 } // end anonymous namespace
 
 RValue CallArg::getRValue(CodeGenFunction &CGF) const {
@@ -4836,7 +4823,9 @@ void CodeGenFunction::EmitWritebacks(const CallArgList &args) {
 
 void CodeGenFunction::EmitCallArg(CallArgList &args, const Expr *E,
                                   QualType type) {
-  DisableDebugLocationUpdates Dis(*this, E);
+  std::optional<DisableDebugLocationUpdates> Dis;
+  if (isa<CXXDefaultArgExpr>(E))
+    Dis.emplace(*this);
   if (const ObjCIndirectCopyRestoreExpr *CRE =
           dyn_cast<ObjCIndirectCopyRestoreExpr>(E)) {
     assert(getLangOpts().ObjCAutoRefCount);
@@ -6229,3 +6218,12 @@ RValue CodeGenFunction::EmitVAArg(VAArgExpr *VE, Address &VAListAddr,
     return CGM.getABIInfo().EmitMSVAArg(*this, VAListAddr, Ty, Slot);
   return CGM.getABIInfo().EmitVAArg(*this, VAListAddr, Ty, Slot);
 }
+
+DisableDebugLocationUpdates::DisableDebugLocationUpdates(CodeGenFunction &CGF)
+    : CGF(CGF) {
+  CGF.disableDebugInfo();
+}
+
+DisableDebugLocationUpdates::~DisableDebugLocationUpdates() {
+  CGF.enableDebugInfo();
+}
diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h
index 0b4e3f9cb036..17b2ce558f71 100644
--- a/clang/lib/CodeGen/CGCall.h
+++ b/clang/lib/CodeGen/CGCall.h
@@ -457,6 +457,12 @@ inline FnInfoOpts &operator&=(FnInfoOpts &A, FnInfoOpts B) {
   return A;
 }
 
+struct DisableDebugLocationUpdates {
+  CodeGenFunction &CGF;
+  DisableDebugLocationUpdates(CodeGenFunction &CGF);
+  ~DisableDebugLocationUpdates();
+};
+
 } // end namespace CodeGen
 } // end namespace clang
 
diff --git a/clang/lib/CodeGen/CGCoroutine.cpp b/clang/lib/CodeGen/CGCoroutine.cpp
index 0fc488e98aaf..117ef3d16e21 100644
--- a/clang/lib/CodeGen/CGCoroutine.cpp
+++ b/clang/lib/CodeGen/CGCoroutine.cpp
@@ -707,11 +707,15 @@ struct GetReturnObjectManager {
     Builder.CreateStore(Builder.getFalse(), GroActiveFlag);
 
     GroEmission = CGF.EmitAutoVarAlloca(*GroVarDecl);
-    auto *GroAlloca = dyn_cast_or_null<llvm::AllocaInst>(
-        GroEmission.getOriginalAllocatedAddress().getPointer());
-    assert(GroAlloca && "expected alloca to be emitted");
-    GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame,
-                           llvm::MDNode::get(CGF.CGM.getLLVMContext(), {}));
+
+    if (!GroVarDecl->isNRVOVariable()) {
+      // NRVO variables don't have allocas and won't have the same issue.
+      auto *GroAlloca = dyn_cast_or_null<llvm::AllocaInst>(
+          GroEmission.getOriginalAllocatedAddress().getPointer());
+      assert(GroAlloca && "expected alloca to be emitted");
+      GroAlloca->setMetadata(llvm::LLVMContext::MD_coro_outside_frame,
+                             llvm::MDNode::get(CGF.CGM.getLLVMContext(), {}));
+    }
 
     // Remember the top of EHStack before emitting the cleanup.
     auto old_top = CGF.EHStack.stable_begin();
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index f97c7b644598..ec28bd259e8e 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -170,6 +170,10 @@ void CGDebugInfo::addInstToSpecificSourceAtom(llvm::Instruction *KeyInstruction,
   if (!Group || !CGM.getCodeGenOpts().DebugKeyInstructions)
     return;
 
+  llvm::DISubprogram *SP = KeyInstruction->getFunction()->getSubprogram();
+  if (!SP || !SP->getKeyInstructionsEnabled())
+    return;
+
   addInstSourceAtomMetadata(KeyInstruction, Group, /*Rank=*/1);
 
   llvm::Instruction *BackupI =
@@ -2641,7 +2645,8 @@ StringRef CGDebugInfo::getVTableName(const CXXRecordDecl *RD) {
 // existing information in the DWARF. The type is assumed to be 'void *'.
 void CGDebugInfo::emitVTableSymbol(llvm::GlobalVariable *VTable,
                                    const CXXRecordDecl *RD) {
-  if (!CGM.getTarget().getCXXABI().isItaniumFamily())
+  if (!CGM.getTarget().getCXXABI().isItaniumFamily() ||
+      CGM.getTarget().getTriple().isOSBinFormatCOFF())
     return;
 
   ASTContext &Context = CGM.getContext();
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 85c768807572..b5debd93b0f6 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -3314,7 +3314,14 @@ LValue CodeGenFunction::EmitDeclRefLValue(const DeclRefExpr *E) {
       auto *FD = LambdaCaptureFields.lookup(BD);
       return EmitCapturedFieldLValue(*this, FD, CXXABIThisValue);
     }
-    return EmitLValue(BD->getBinding());
+    // Suppress debug location updates when visiting the binding, since the
+    // binding may emit instructions that would otherwise be associated with the
+    // binding itself, rather than the expression referencing the binding. (this
+    // leads to jumpy debug stepping behavior where the location/debugger jump
+    // back to the binding declaration, then back to the expression referencing
+    // the binding)
+    DisableDebugLocationUpdates D(*this);
+    return EmitLValue(BD->getBinding(), NotKnownNonNull);
   }
 
   // We can form DeclRefExprs naming GUID declarations when reconstituting
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 359e30cb8f5c..912b1d72c7e2 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -2313,7 +2313,8 @@ llvm::Value *CodeGenFunction::EmitDynamicCast(Address ThisAddr,
   bool IsExact = !IsDynamicCastToVoid &&
                  CGM.getCodeGenOpts().OptimizationLevel > 0 &&
                  DestRecordTy->getAsCXXRecordDecl()->isEffectivelyFinal() &&
-                 CGM.getCXXABI().shouldEmitExactDynamicCast(DestRecordTy);
+                 CGM.getCXXABI().shouldEmitExactDynamicCast(DestRecordTy) &&
+                 !getLangOpts().PointerAuthCalls;
 
   // C++ [expr.dynamic.cast]p4:
   //   If the value of v is a null pointer value in the pointer case, the result
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 715bd392f59f..0cc468ca3ab7 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -873,8 +873,9 @@ bool ConstStructBuilder::Build(const APValue &Val, const RecordDecl *RD,
 
     for (const BaseInfo &Base : Bases) {
       bool IsPrimaryBase = Layout.getPrimaryBase() == Base.Decl;
-      Build(Val.getStructBase(Base.Index), Base.Decl, IsPrimaryBase,
-            VTableClass, Offset + Base.Offset);
+      if (!Build(Val.getStructBase(Base.Index), Base.Decl, IsPrimaryBase,
+                 VTableClass, Offset + Base.Offset))
+        return false;
     }
   }
 
@@ -1620,7 +1621,7 @@ llvm::Constant *ConstantEmitter::tryEmitConstantExpr(const ConstantExpr *CE) {
   if (CE->isGLValue())
     RetType = CGM.getContext().getLValueReferenceType(RetType);
 
-  return emitAbstract(CE->getBeginLoc(), CE->getAPValueResult(), RetType);
+  return tryEmitAbstract(CE->getAPValueResult(), RetType);
 }
 
 llvm::Constant *
diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp
index e0650067b954..1a8c6f015bda 100644
--- a/clang/lib/CodeGen/CGStmt.cpp
+++ b/clang/lib/CodeGen/CGStmt.cpp
@@ -846,11 +846,13 @@ void CodeGenFunction::EmitGotoStmt(const GotoStmt &S) {
   if (HaveInsertPoint())
     EmitStopPoint(&S);
 
+  ApplyAtomGroup Grp(getDebugInfo());
   EmitBranchThroughCleanup(getJumpDestForLabel(S.getLabel()));
 }
 
 
 void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) {
+  ApplyAtomGroup Grp(getDebugInfo());
   if (const LabelDecl *Target = S.getConstantTarget()) {
     EmitBranchThroughCleanup(getJumpDestForLabel(Target));
     return;
@@ -869,6 +871,8 @@ void CodeGenFunction::EmitIndirectGotoStmt(const IndirectGotoStmt &S) {
   cast<llvm::PHINode>(IndGotoBB->begin())->addIncoming(V, CurBB);
 
   EmitBranch(IndGotoBB);
+  if (CurBB && CurBB->getTerminator())
+    addInstToCurrentSourceAtom(CurBB->getTerminator(), nullptr);
 }
 
 void CodeGenFunction::EmitIfStmt(const IfStmt &S) {
@@ -2672,6 +2676,9 @@ static void UpdateAsmCallInst(llvm::CallBase &Result, bool HasSideEffect,
                                          llvm::ConstantAsMetadata::get(Loc)));
   }
 
+  // Make inline-asm calls Key for the debug info feature Key Instructions.
+  CGF.addInstToNewSourceAtom(&Result, nullptr);
+
   if (!NoConvergent && CGF.getLangOpts().assumeFunctionsAreConvergent())
     // Conservatively, mark all inline asm blocks in CUDA or OpenCL as
     // convergent (meaning, they may call an intrinsically convergent op, such
@@ -2750,6 +2757,7 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
       }
     }
 
+    ApplyAtomGroup Grp(CGF.getDebugInfo());
     LValue Dest = ResultRegDests[i];
     // ResultTypeRequiresCast elements correspond to the first
     // ResultTypeRequiresCast.size() elements of RegResults.
@@ -2757,7 +2765,8 @@ EmitAsmStores(CodeGenFunction &CGF, const AsmStmt &S,
       unsigned Size = CGF.getContext().getTypeSize(ResultRegQualTys[i]);
       Address A = Dest.getAddress().withElementType(ResultRegTypes[i]);
       if (CGF.getTargetHooks().isScalarizableAsmOperand(CGF, TruncTy)) {
-        Builder.CreateStore(Tmp, A);
+        llvm::StoreInst *S = Builder.CreateStore(Tmp, A);
+        CGF.addInstToCurrentSourceAtom(S, S->getValueOperand());
         continue;
       }
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 6c32c98cec01..f5ac9f387c64 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -727,7 +727,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   };
 
   /// Header for data within LifetimeExtendedCleanupStack.
-  struct LifetimeExtendedCleanupHeader {
+  struct alignas(uint64_t) LifetimeExtendedCleanupHeader {
     /// The size of the following cleanup object.
     unsigned Size;
     /// The kind of cleanup to push.
@@ -949,7 +949,8 @@ class CodeGenFunction : public CodeGenTypeCache {
         LifetimeExtendedCleanupStack.size() + sizeof(Header) + Header.Size +
         (Header.IsConditional ? sizeof(ActiveFlag) : 0));
 
-    static_assert(sizeof(Header) % alignof(T) == 0,
+    static_assert((alignof(LifetimeExtendedCleanupHeader) == alignof(T)) &&
+                      (alignof(T) == alignof(RawAddress)),
                   "Cleanup will be allocated on misaligned address");
     char *Buffer = &LifetimeExtendedCleanupStack[OldSize];
     new (Buffer) LifetimeExtendedCleanupHeader(Header);
diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h
index ed11dc2bb05d..54f6ceaa52b9 100644
--- a/clang/lib/CodeGen/EHScopeStack.h
+++ b/clang/lib/CodeGen/EHScopeStack.h
@@ -143,7 +143,7 @@ class EHScopeStack {
   ///
   /// Cleanup implementations should generally be declared in an
   /// anonymous namespace.
-  class Cleanup {
+  class alignas(uint64_t) Cleanup {
     // Anchor the construction vtable.
     virtual void anchor();
 
diff --git a/clang/lib/Driver/Multilib.cpp b/clang/lib/Driver/Multilib.cpp
index 87fa1af54a8e..2faf33fb47c9 100644
--- a/clang/lib/Driver/Multilib.cpp
+++ b/clang/lib/Driver/Multilib.cpp
@@ -29,9 +29,15 @@ using namespace llvm::sys;
 
 Multilib::Multilib(StringRef GCCSuffix, StringRef OSSuffix,
                    StringRef IncludeSuffix, const flags_list &Flags,
+                   // Downstream issue: #446 (Extend the Multilib system to
+                   // support an IncludeDirs field)
+                   const includedirs_list &IncludeDirs,
                    StringRef ExclusiveGroup, std::optional<StringRef> Error)
     : GCCSuffix(GCCSuffix), OSSuffix(OSSuffix), IncludeSuffix(IncludeSuffix),
-      Flags(Flags), ExclusiveGroup(ExclusiveGroup), Error(Error) {
+      Flags(Flags),
+      // Downstream issue: #446 (Extend the Multilib system to support an
+      // IncludeDirs field)
+      IncludeDirs(IncludeDirs), ExclusiveGroup(ExclusiveGroup), Error(Error) {
   assert(GCCSuffix.empty() ||
          (StringRef(GCCSuffix).front() == '/' && GCCSuffix.size() > 1));
   assert(OSSuffix.empty() ||
@@ -299,6 +305,9 @@ struct MultilibSerialization {
   std::string Dir;        // if this record successfully selects a library dir
   std::string Error;      // if this record reports a fatal error message
   std::vector<std::string> Flags;
+  // Downstream issue: #446 (Extend the Multilib system to support an
+  // IncludeDirs field)
+  std::vector<std::string> IncludeDirs;
   std::string Group;
 };
 
@@ -350,6 +359,9 @@ template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
     io.mapOptional("Dir", V.Dir);
     io.mapOptional("Error", V.Error);
     io.mapRequired("Flags", V.Flags);
+    // Downstream issue: #446 (Extend the Multilib system to support an
+    // IncludeDirs field)
+    io.mapOptional("IncludeDirs", V.IncludeDirs);
     io.mapOptional("Group", V.Group);
   }
   static std::string validate(IO &io, MultilibSerialization &V) {
@@ -359,6 +371,12 @@ template <> struct llvm::yaml::MappingTraits<MultilibSerialization> {
       return "the 'Dir' and 'Error' keys may not both be specified";
     if (StringRef(V.Dir).starts_with("/"))
       return "paths must be relative but \"" + V.Dir + "\" starts with \"/\"";
+    // Downstream issue: #446 (Extend the Multilib system to support an
+    // IncludeDirs field)
+    for (const auto &Path : V.IncludeDirs) {
+      if (StringRef(Path).starts_with("/"))
+        return "paths must be relative but \"" + Path + "\" starts with \"/\"";
+    }
     return std::string{};
   }
 };
@@ -489,7 +507,10 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
   Multilibs.reserve(MS.Multilibs.size());
   for (const auto &M : MS.Multilibs) {
     if (!M.Error.empty()) {
-      Multilibs.emplace_back("", "", "", M.Flags, M.Group, M.Error);
+      // Downstream issue: #446 (Extend the Multilib system to support an
+      // IncludeDirs field)
+      Multilibs.emplace_back("", "", "", M.Flags, M.IncludeDirs, M.Group,
+                             M.Error);
     } else {
       std::string Dir;
       if (M.Dir != ".")
@@ -498,7 +519,9 @@ MultilibSet::parseYaml(llvm::MemoryBufferRef Input,
       // Multilib constructor. If we later support more than one type of group,
       // we'll have to look up the group name in MS.Groups, check its type, and
       // decide what to do here.
-      Multilibs.emplace_back(Dir, Dir, Dir, M.Flags, M.Group);
+      // Downstream issue: #446 (Extend the Multilib system to support an
+      // IncludeDirs field)
+      Multilibs.emplace_back(Dir, Dir, Dir, M.Flags, M.IncludeDirs, M.Group);
     }
   }
 
diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp
index 3f9b808b2722..7f79f78bd719 100644
--- a/clang/lib/Driver/ToolChain.cpp
+++ b/clang/lib/Driver/ToolChain.cpp
@@ -255,6 +255,20 @@ static void getAArch64MultilibFlags(const Driver &D,
     Result.push_back(ABIArg->getAsString(Args));
   }
 
+  // Start downstream change. Issue #528
+  if (const Arg *A = Args.getLastArg(options::OPT_O_Group);
+      A && A->getOption().matches(options::OPT_O)) {
+    switch (A->getValue()[0]) {
+    case 's':
+      Result.push_back("-Os");
+      break;
+    case 'z':
+      Result.push_back("-Oz");
+      break;
+    }
+  }
+  // End downstream change. Issue #528
+
   processMultilibCustomFlags(Result, Args);
 }
 
@@ -320,6 +334,21 @@ static void getARMMultilibFlags(const Driver &D,
     if (Endian->getOption().matches(options::OPT_mbig_endian))
       Result.push_back(Endian->getAsString(Args));
   }
+
+  // Start downstream change. Issue #528
+  if (const Arg *A = Args.getLastArg(options::OPT_O_Group);
+      A && A->getOption().matches(options::OPT_O)) {
+    switch (A->getValue()[0]) {
+    case 's':
+      Result.push_back("-Os");
+      break;
+    case 'z':
+      Result.push_back("-Oz");
+      break;
+    }
+  }
+  // End downstream change. Issue #528
+
   processMultilibCustomFlags(Result, Args);
 }
 
@@ -837,17 +866,30 @@ void ToolChain::addFortranRuntimeLibs(const ArgList &Args,
 
 void ToolChain::addFortranRuntimeLibraryPath(const llvm::opt::ArgList &Args,
                                              ArgStringList &CmdArgs) const {
-  // Default to the <driver-path>/../lib directory. This works fine on the
-  // platforms that we have tested so far. We will probably have to re-fine
-  // this in the future. In particular, on some platforms, we may need to use
-  // lib64 instead of lib.
+  auto AddLibSearchPathIfExists = [&](const Twine &Path) {
+    // Linker may emit warnings about non-existing directories
+    if (!llvm::sys::fs::is_directory(Path))
+      return;
+
+    if (getTriple().isKnownWindowsMSVCEnvironment())
+      CmdArgs.push_back(Args.MakeArgString("-libpath:" + Path));
+    else
+      CmdArgs.push_back(Args.MakeArgString("-L" + Path));
+  };
+
+  // Search for flang_rt.* at the same location as clang_rt.* with
+  // LLVM_ENABLE_PER_TARGET_RUNTIME_DIR=0. On most platforms, flang_rt is
+  // located at the path returned by getRuntimePath() which is already added to
+  // the library search path. This exception is for Apple-Darwin.
+  AddLibSearchPathIfExists(getCompilerRTPath());
+
+  // Fall back to the non-resource directory <driver-path>/../lib. We will
+  // probably have to refine this in the future. In particular, on some
+  // platforms, we may need to use lib64 instead of lib.
   SmallString<256> DefaultLibPath =
       llvm::sys::path::parent_path(getDriver().Dir);
   llvm::sys::path::append(DefaultLibPath, "lib");
-  if (getTriple().isKnownWindowsMSVCEnvironment())
-    CmdArgs.push_back(Args.MakeArgString("-libpath:" + DefaultLibPath));
-  else
-    CmdArgs.push_back(Args.MakeArgString("-L" + DefaultLibPath));
+  AddLibSearchPathIfExists(DefaultLibPath);
 }
 
 void ToolChain::addFlangRTLibPath(const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
index 6bd710ec6b8b..418f9fd9ca4c 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp
@@ -467,3 +467,18 @@ void aarch64::setPAuthABIInTriple(const Driver &D, const ArgList &Args,
     break;
   }
 }
+
+/// Is the triple {aarch64.aarch64_be}-none-elf?
+bool aarch64::isAArch64BareMetal(const llvm::Triple &Triple) {
+  if (Triple.getArch() != llvm::Triple::aarch64 &&
+      Triple.getArch() != llvm::Triple::aarch64_be)
+    return false;
+
+  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
+    return false;
+
+  if (Triple.getOS() != llvm::Triple::UnknownOS)
+    return false;
+
+  return Triple.getEnvironmentName() == "elf";
+}
diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.h b/clang/lib/Driver/ToolChains/Arch/AArch64.h
index 2057272867a1..2765ee8c3a6a 100644
--- a/clang/lib/Driver/ToolChains/Arch/AArch64.h
+++ b/clang/lib/Driver/ToolChains/Arch/AArch64.h
@@ -30,6 +30,7 @@ std::string getAArch64TargetCPU(const llvm::opt::ArgList &Args,
 
 void setPAuthABIInTriple(const Driver &D, const llvm::opt::ArgList &Args,
                          llvm::Triple &triple);
+bool isAArch64BareMetal(const llvm::Triple &Triple);
 
 } // end namespace aarch64
 } // end namespace target
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
index 9595ee8383c8..94a94f1e9c48 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.cpp
@@ -23,7 +23,9 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
   if (Triple.getArch() == llvm::Triple::sparcv9) {
     const char *DefV9CPU;
 
-    if (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD())
+    if (Triple.isOSSolaris())
+      DefV9CPU = "-Av9b";
+    else if (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD())
       DefV9CPU = "-Av9a";
     else
       DefV9CPU = "-Av9";
@@ -35,6 +37,13 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
         .Case("niagara4", "-Av9d")
         .Default(DefV9CPU);
   } else {
+    const char *DefV8CPU;
+
+    if (Triple.isOSSolaris())
+      DefV8CPU = "-Av8plus";
+    else
+      DefV8CPU = "-Av8";
+
     return llvm::StringSwitch<const char *>(Name)
         .Case("v8", "-Av8")
         .Case("supersparc", "-Av8")
@@ -70,7 +79,7 @@ const char *sparc::getSparcAsmModeForCPU(StringRef Name,
         .Case("gr712rc", "-Aleon")
         .Case("leon4", "-Aleon")
         .Case("gr740", "-Aleon")
-        .Default("-Av8");
+        .Default(DefV8CPU);
   }
 }
 
@@ -130,7 +139,8 @@ std::string sparc::getSparcTargetCPU(const Driver &D, const ArgList &Args,
   return "";
 }
 
-void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
+void sparc::getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple,
+                                   const ArgList &Args,
                                    std::vector<StringRef> &Features) {
   sparc::FloatABI FloatABI = sparc::getSparcFloatABI(D, Args);
   if (FloatABI == sparc::FloatABI::Soft)
@@ -150,11 +160,22 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
       Features.push_back("-popc");
   }
 
+  // Those OSes default to enabling VIS on 64-bit SPARC.
+  // See also the corresponding code for external assemblers in
+  // sparc::getSparcAsmModeForCPU().
+  bool IsSparcV9ATarget =
+      (Triple.getArch() == llvm::Triple::sparcv9) &&
+      (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD());
+  bool IsSparcV9BTarget = Triple.isOSSolaris();
+  bool IsSparcV8PlusTarget =
+      Triple.getArch() == llvm::Triple::sparc && Triple.isOSSolaris();
   if (Arg *A = Args.getLastArg(options::OPT_mvis, options::OPT_mno_vis)) {
     if (A->getOption().matches(options::OPT_mvis))
       Features.push_back("+vis");
     else
       Features.push_back("-vis");
+  } else if (IsSparcV9ATarget) {
+    Features.push_back("+vis");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_mvis2, options::OPT_mno_vis2)) {
@@ -162,6 +183,8 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
       Features.push_back("+vis2");
     else
       Features.push_back("-vis2");
+  } else if (IsSparcV9BTarget) {
+    Features.push_back("+vis2");
   }
 
   if (Arg *A = Args.getLastArg(options::OPT_mvis3, options::OPT_mno_vis3)) {
@@ -182,6 +205,8 @@ void sparc::getSparcTargetFeatures(const Driver &D, const ArgList &Args,
   if (Arg *A = Args.getLastArg(options::OPT_mv8plus, options::OPT_mno_v8plus)) {
     if (A->getOption().matches(options::OPT_mv8plus))
       Features.push_back("+v8plus");
+  } else if (IsSparcV8PlusTarget) {
+    Features.push_back("+v8plus");
   }
 
   if (Args.hasArg(options::OPT_ffixed_g1))
diff --git a/clang/lib/Driver/ToolChains/Arch/Sparc.h b/clang/lib/Driver/ToolChains/Arch/Sparc.h
index 2b178d9df1ee..fa25b4992cc8 100644
--- a/clang/lib/Driver/ToolChains/Arch/Sparc.h
+++ b/clang/lib/Driver/ToolChains/Arch/Sparc.h
@@ -31,7 +31,8 @@ FloatABI getSparcFloatABI(const Driver &D, const llvm::opt::ArgList &Args);
 std::string getSparcTargetCPU(const Driver &D, const llvm::opt::ArgList &Args,
                               const llvm::Triple &Triple);
 
-void getSparcTargetFeatures(const Driver &D, const llvm::opt::ArgList &Args,
+void getSparcTargetFeatures(const Driver &D, const llvm::Triple &Triple,
+                            const llvm::opt::ArgList &Args,
                             std::vector<llvm::StringRef> &Features);
 const char *getSparcAsmModeForCPU(llvm::StringRef Name,
                                   const llvm::Triple &Triple);
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp
index e670696cd59a..0e07a9cf16c9 100644
--- a/clang/lib/Driver/ToolChains/BareMetal.cpp
+++ b/clang/lib/Driver/ToolChains/BareMetal.cpp
@@ -12,6 +12,7 @@
 #include "clang/Driver/CommonArgs.h"
 #include "clang/Driver/InputInfo.h"
 
+#include "Arch/AArch64.h"
 #include "Arch/ARM.h"
 #include "Arch/RISCV.h"
 #include "clang/Driver/Compilation.h"
@@ -31,21 +32,6 @@ using namespace clang::driver;
 using namespace clang::driver::tools;
 using namespace clang::driver::toolchains;
 
-/// Is the triple {aarch64.aarch64_be}-none-elf?
-static bool isAArch64BareMetal(const llvm::Triple &Triple) {
-  if (Triple.getArch() != llvm::Triple::aarch64 &&
-      Triple.getArch() != llvm::Triple::aarch64_be)
-    return false;
-
-  if (Triple.getVendor() != llvm::Triple::UnknownVendor)
-    return false;
-
-  if (Triple.getOS() != llvm::Triple::UnknownOS)
-    return false;
-
-  return Triple.getEnvironmentName() == "elf";
-}
-
 static bool isRISCVBareMetal(const llvm::Triple &Triple) {
   if (!Triple.isRISCV())
     return false;
@@ -363,8 +349,9 @@ void BareMetal::findMultilibs(const Driver &D, const llvm::Triple &Triple,
 }
 
 bool BareMetal::handlesTarget(const llvm::Triple &Triple) {
-  return arm::isARMEABIBareMetal(Triple) || isAArch64BareMetal(Triple) ||
-         isRISCVBareMetal(Triple) || isPPCBareMetal(Triple);
+  return arm::isARMEABIBareMetal(Triple) ||
+         aarch64::isAArch64BareMetal(Triple) || isRISCVBareMetal(Triple) ||
+         isPPCBareMetal(Triple);
 }
 
 Tool *BareMetal::buildLinker() const {
@@ -427,10 +414,22 @@ void BareMetal::AddClangSystemIncludeArgs(const ArgList &DriverArgs,
   const SmallString<128> SysRootDir(computeSysRoot());
   if (!SysRootDir.empty()) {
     for (const Multilib &M : getOrderedMultilibs()) {
-      SmallString<128> Dir(SysRootDir);
-      llvm::sys::path::append(Dir, M.includeSuffix());
-      llvm::sys::path::append(Dir, "include");
-      addSystemInclude(DriverArgs, CC1Args, Dir.str());
+      // Downstream issue: #446 (Extend the Multilib system to support an
+      // IncludeDirs field)
+      if (!M.includeDirs().empty()) {
+        // Add include directories specified in multilib.yaml under the
+        // 'IncludeDirs' field
+        for (const std::string &Path : M.includeDirs()) {
+          SmallString<128> Dir(SysRoot);
+          llvm::sys::path::append(Dir, Path);
+          addSystemInclude(DriverArgs, CC1Args, Dir.str());
+        }
+      } else {
+        SmallString<128> Dir(SysRootDir);
+        llvm::sys::path::append(Dir, M.includeSuffix());
+        llvm::sys::path::append(Dir, "include");
+        addSystemInclude(DriverArgs, CC1Args, Dir.str());
+      }
     }
   }
 }
@@ -511,6 +510,18 @@ void BareMetal::AddClangCXXStdlibIncludeArgs(const ArgList &DriverArgs,
         addSystemInclude(DriverArgs, CC1Args, TargetDir.str());
         break;
       }
+      // Downstream issue: #446 (Extend the Multilib system to support an
+      // IncludeDirs field)
+      if (!M.includeDirs().empty()) {
+        // Add include directories specified in multilib.yaml under the
+        // 'IncludeDirs' field
+        for (const std::string &Path : M.includeDirs()) {
+          Dir = SysRoot;
+          llvm::sys::path::append(Dir, Path, "c++", "v1");
+          addSystemInclude(DriverArgs, CC1Args, Dir.str());
+        }
+        break;
+      }
       // Add generic path if nothing else succeeded so far.
       llvm::sys::path::append(Dir, "include", "c++", "v1");
       addSystemInclude(DriverArgs, CC1Args, Dir.str());
@@ -694,9 +705,6 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA,
       NeedCRTs)
     CmdArgs.push_back(Args.MakeArgString(TC.GetFilePath(CRTEnd)));
 
-  if (TC.getTriple().isRISCV())
-    CmdArgs.push_back("-X");
-
   // The R_ARM_TARGET2 relocation must be treated as R_ARM_REL32 on arm*-*-elf
   // and arm*-*-eabi (the default is R_ARM_GOT_PREL, used on arm*-*-linux and
   // arm*-*-*bsd).
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 456bfe885f35..62613322320c 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -1752,7 +1752,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args,
                     options::OPT_fno_ptrauth_objc_interface_sel);
   Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_class_ro,
                     options::OPT_fno_ptrauth_objc_class_ro);
-
   if (Triple.getEnvironment() == llvm::Triple::PAuthTest)
     handlePAuthABI(Args, CmdArgs);
 
@@ -2731,16 +2730,6 @@ static void CollectArgsForIntegratedAssembler(Compilation &C,
     CmdArgs.push_back(MipsTargetFeature);
   }
 
-  // Those OSes default to enabling VIS on 64-bit SPARC.
-  // See also the corresponding code for external assemblers in
-  // sparc::getSparcAsmModeForCPU().
-  bool IsSparcV9ATarget =
-      (C.getDefaultToolChain().getArch() == llvm::Triple::sparcv9) &&
-      (Triple.isOSLinux() || Triple.isOSFreeBSD() || Triple.isOSOpenBSD());
-  if (IsSparcV9ATarget && SparcTargetFeatures.empty()) {
-    CmdArgs.push_back("-target-feature");
-    CmdArgs.push_back("+vis");
-  }
   for (const char *Feature : SparcTargetFeatures) {
     CmdArgs.push_back("-target-feature");
     CmdArgs.push_back(Feature);
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 097d186ad8ea..8d3775de9be5 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -547,15 +547,22 @@ const char *tools::getLDMOption(const llvm::Triple &T, const ArgList &Args) {
   case llvm::Triple::aarch64:
     if (T.isOSManagarm())
       return "aarch64managarm";
+    else if (aarch64::isAArch64BareMetal(T))
+      return "aarch64elf";
     return "aarch64linux";
   case llvm::Triple::aarch64_be:
+    if (aarch64::isAArch64BareMetal(T))
+      return "aarch64elfb";
     return "aarch64linuxb";
   case llvm::Triple::arm:
   case llvm::Triple::thumb:
   case llvm::Triple::armeb:
-  case llvm::Triple::thumbeb:
-    return tools::arm::isARMBigEndian(T, Args) ? "armelfb_linux_eabi"
-                                               : "armelf_linux_eabi";
+  case llvm::Triple::thumbeb: {
+    bool IsBigEndian = tools::arm::isARMBigEndian(T, Args);
+    if (arm::isARMEABIBareMetal(T))
+      return IsBigEndian ? "armelfb" : "armelf";
+    return IsBigEndian ? "armelfb_linux_eabi" : "armelf_linux_eabi";
+  }
   case llvm::Triple::m68k:
     return "m68kelf";
   case llvm::Triple::ppc:
@@ -856,7 +863,7 @@ void tools::getTargetFeatures(const Driver &D, const llvm::Triple &Triple,
   case llvm::Triple::sparc:
   case llvm::Triple::sparcel:
   case llvm::Triple::sparcv9:
-    sparc::getSparcTargetFeatures(D, Args, Features);
+    sparc::getSparcTargetFeatures(D, Triple, Args, Features);
     break;
   case llvm::Triple::r600:
   case llvm::Triple::amdgcn:
@@ -1320,6 +1327,17 @@ void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args,
   if (Args.hasArg(options::OPT_ftime_report))
     CmdArgs.push_back(
         Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes"));
+
+  if (Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) {
+    CmdArgs.push_back(
+        Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue())));
+    CmdArgs.push_back(
+        Args.MakeArgString("--thinlto-remote-compiler=" +
+                           Twine(ToolChain.getDriver().getClangProgramPath())));
+
+    for (auto A : Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ))
+      CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A));
+  }
 }
 
 void tools::addOpenMPRuntimeLibraryPath(const ToolChain &TC,
@@ -3247,14 +3265,8 @@ void tools::handleVectorizeSLPArgs(const ArgList &Args,
 
 void tools::handleInterchangeLoopsArgs(const ArgList &Args,
                                        ArgStringList &CmdArgs) {
-  // FIXME: instead of relying on shouldEnableVectorizerAtOLevel, we may want to
-  // implement a separate function to infer loop interchange from opt level.
-  // For now, enable loop-interchange at the same opt levels as loop-vectorize.
-  bool EnableInterchange = shouldEnableVectorizerAtOLevel(Args, false);
-  OptSpecifier InterchangeAliasOption =
-      EnableInterchange ? options::OPT_O_Group : options::OPT_floop_interchange;
-  if (Args.hasFlag(options::OPT_floop_interchange, InterchangeAliasOption,
-                   options::OPT_fno_loop_interchange, EnableInterchange))
+  if (Args.hasFlag(options::OPT_floop_interchange,
+                   options::OPT_fno_loop_interchange, false))
     CmdArgs.push_back("-floop-interchange");
 }
 
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index e5075cbcaf66..234683f2f488 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -3187,28 +3187,46 @@ void MachO::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
 
   ToolChain::addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadKind);
 
-  // On arm64e, enable pointer authentication (for the return address and
-  // indirect calls), as well as usage of the intrinsics.
-  if (getArchName() == "arm64e") {
+  // On arm64e, we enable all the features required for the Darwin userspace
+  // ABI
+  if (getTriple().isArm64e()) {
+    // Core platform ABI
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
+                           options::OPT_fno_ptrauth_calls))
+      CC1Args.push_back("-fptrauth-calls");
     if (!DriverArgs.hasArg(options::OPT_fptrauth_returns,
                            options::OPT_fno_ptrauth_returns))
       CC1Args.push_back("-fptrauth-returns");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics,
                            options::OPT_fno_ptrauth_intrinsics))
       CC1Args.push_back("-fptrauth-intrinsics");
-
-    if (!DriverArgs.hasArg(options::OPT_fptrauth_calls,
-                           options::OPT_fno_ptrauth_calls))
-      CC1Args.push_back("-fptrauth-calls");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos,
                            options::OPT_fno_ptrauth_indirect_gotos))
       CC1Args.push_back("-fptrauth-indirect-gotos");
-
     if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps,
                            options::OPT_fno_ptrauth_auth_traps))
       CC1Args.push_back("-fptrauth-auth-traps");
+
+    // C++ v-table ABI
+    if (!DriverArgs.hasArg(
+            options::OPT_fptrauth_vtable_pointer_address_discrimination,
+            options::OPT_fno_ptrauth_vtable_pointer_address_discrimination))
+      CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination");
+    if (!DriverArgs.hasArg(
+            options::OPT_fptrauth_vtable_pointer_type_discrimination,
+            options::OPT_fno_ptrauth_vtable_pointer_type_discrimination))
+      CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination");
+
+    // Objective-C ABI
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_isa,
+                           options::OPT_fno_ptrauth_objc_isa))
+      CC1Args.push_back("-fptrauth-objc-isa");
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_class_ro,
+                           options::OPT_fno_ptrauth_objc_class_ro))
+      CC1Args.push_back("-fptrauth-objc-class-ro");
+    if (!DriverArgs.hasArg(options::OPT_fptrauth_objc_interface_sel,
+                           options::OPT_fno_ptrauth_objc_interface_sel))
+      CC1Args.push_back("-fptrauth-objc-interface-sel");
   }
 }
 
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 1edb83f7255e..7ab41e9b85a0 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -447,6 +447,7 @@ void Flang::addTargetOptions(const ArgList &Args,
   // Add the target features.
   switch (TC.getArch()) {
   default:
+    getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
     break;
   case llvm::Triple::aarch64:
     getTargetFeatures(D, Triple, Args, CmdArgs, /*ForAs*/ false);
diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp
index b2e36ae6f97c..4894e6a52e93 100644
--- a/clang/lib/Driver/ToolChains/MinGW.cpp
+++ b/clang/lib/Driver/ToolChains/MinGW.cpp
@@ -85,11 +85,18 @@ void tools::MinGW::Linker::AddLibGCC(const ArgList &Args,
 
   CmdArgs.push_back("-lmoldname");
   CmdArgs.push_back("-lmingwex");
-  for (auto Lib : Args.getAllArgValues(options::OPT_l))
+  for (auto Lib : Args.getAllArgValues(options::OPT_l)) {
     if (StringRef(Lib).starts_with("msvcr") ||
         StringRef(Lib).starts_with("ucrt") ||
-        StringRef(Lib).starts_with("crtdll"))
+        StringRef(Lib).starts_with("crtdll")) {
+      std::string CRTLib = (llvm::Twine("-l") + Lib).str();
+      // Respect the user's chosen crt variant, but still provide it
+      // again as the last linker argument, because some of the libraries
+      // we added above may depend on it.
+      CmdArgs.push_back(Args.MakeArgStringRef(CRTLib));
       return;
+    }
+  }
   CmdArgs.push_back("-lmsvcrt");
 }
 
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp
index 79b1b6960da1..8f589186af34 100644
--- a/clang/lib/Driver/ToolChains/OpenBSD.cpp
+++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp
@@ -161,7 +161,7 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   if (Nopie || Profiling)
     CmdArgs.push_back("-nopie");
 
-  if (Triple.isRISCV64()) {
+  if (Triple.isLoongArch64() || Triple.isRISCV64()) {
     CmdArgs.push_back("-X");
     if (Args.hasArg(options::OPT_mno_relax))
       CmdArgs.push_back("--no-relax");
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 4010f7fbd25b..099994695dec 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -628,9 +628,16 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
       // name.
       !Style.isJavaScript() && Previous.isNot(tok::kw_template) &&
       CurrentState.BreakBeforeParameter) {
-    for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous)
-      if (Tok->FirstAfterPPLine || Tok->is(TT_LineComment))
+    for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous) {
+      if (Tok->is(TT_LineComment))
         return false;
+      if (Tok->is(TT_TemplateCloser)) {
+        Tok = Tok->MatchingParen;
+        assert(Tok);
+      }
+      if (Tok->FirstAfterPPLine)
+        return false;
+    }
 
     return true;
   }
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 78c09be458f0..513fcfcd4125 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -727,6 +727,7 @@ template <> struct MappingTraits<FormatStyle::SpaceBeforeParensCustom> {
     IO.mapOptional("AfterFunctionDeclarationName",
                    Spacing.AfterFunctionDeclarationName);
     IO.mapOptional("AfterIfMacros", Spacing.AfterIfMacros);
+    IO.mapOptional("AfterNot", Spacing.AfterNot);
     IO.mapOptional("AfterOverloadedOperator", Spacing.AfterOverloadedOperator);
     IO.mapOptional("AfterPlacementOperator", Spacing.AfterPlacementOperator);
     IO.mapOptional("AfterRequiresInClause", Spacing.AfterRequiresInClause);
@@ -1748,7 +1749,6 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   GoogleStyle.AttributeMacros.push_back("absl_nullable");
   GoogleStyle.AttributeMacros.push_back("absl_nullability_unknown");
   GoogleStyle.BreakTemplateDeclarations = FormatStyle::BTDS_Yes;
-  GoogleStyle.DerivePointerAlignment = true;
   GoogleStyle.IncludeStyle.IncludeBlocks = tooling::IncludeStyle::IBS_Regroup;
   GoogleStyle.IncludeStyle.IncludeCategories = {{"^<ext/.*\\.h>", 2, 0, false},
                                                 {"^<.*\\.h>", 1, 0, false},
@@ -1857,6 +1857,7 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
   } else if (Language == FormatStyle::LK_ObjC) {
     GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
     GoogleStyle.ColumnLimit = 100;
+    GoogleStyle.DerivePointerAlignment = true;
     // "Regroup" doesn't work well for ObjC yet (main header heuristic,
     // relationship between ObjC standard library headers and other heades,
     // #imports, etc.)
diff --git a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
index 87823ae32b11..aa752f5e3148 100644
--- a/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
+++ b/clang/lib/Format/IntegerLiteralSeparatorFixer.cpp
@@ -45,15 +45,18 @@ std::pair<tooling::Replacements, unsigned>
 IntegerLiteralSeparatorFixer::process(const Environment &Env,
                                       const FormatStyle &Style) {
   switch (Style.Language) {
-  case FormatStyle::LK_Cpp:
-  case FormatStyle::LK_ObjC:
-    Separator = '\'';
-    break;
   case FormatStyle::LK_CSharp:
   case FormatStyle::LK_Java:
   case FormatStyle::LK_JavaScript:
     Separator = '_';
     break;
+  case FormatStyle::LK_Cpp:
+  case FormatStyle::LK_ObjC:
+    if (Style.Standard >= FormatStyle::LS_Cpp14) {
+      Separator = '\'';
+      break;
+    }
+    [[fallthrough]];
   default:
     return {};
   }
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 739209a5681f..cbeb5ef7e4bf 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -2590,6 +2590,9 @@ class AnnotatingParser {
     if (!Tok.Previous || Tok.isNot(tok::identifier) || Tok.is(TT_ClassHeadName))
       return false;
 
+    if (Tok.endsSequence(Keywords.kw_final, TT_ClassHeadName))
+      return false;
+
     if ((Style.isJavaScript() || Style.isJava()) && Tok.is(Keywords.kw_extends))
       return false;
 
@@ -2996,14 +2999,18 @@ class AnnotatingParser {
     const FormatToken *PrevToken = Tok.getPreviousNonComment();
     if (!PrevToken)
       return TT_UnaryOperator;
-    if (PrevToken->is(TT_TypeName))
+    if (PrevToken->isTypeName(LangOpts))
       return TT_PointerOrReference;
     if (PrevToken->isPlacementOperator() && Tok.is(tok::ampamp))
       return TT_BinaryOperator;
 
-    const FormatToken *NextToken = Tok.getNextNonComment();
+    auto *NextToken = Tok.getNextNonComment();
     if (!NextToken)
       return TT_PointerOrReference;
+    if (NextToken->is(tok::greater)) {
+      NextToken->setFinalizedType(TT_TemplateCloser);
+      return TT_PointerOrReference;
+    }
 
     if (InTemplateArgument && NextToken->is(tok::kw_noexcept))
       return TT_BinaryOperator;
@@ -3112,7 +3119,7 @@ class AnnotatingParser {
 
     // It's more likely that & represents operator& than an uninitialized
     // reference.
-    if (Tok.is(tok::amp) && PrevToken && PrevToken->Tok.isAnyIdentifier() &&
+    if (Tok.is(tok::amp) && PrevToken->Tok.isAnyIdentifier() &&
         IsChainedOperatorAmpOrMember(PrevToken->getPreviousNonComment()) &&
         NextToken && NextToken->Tok.isAnyIdentifier()) {
       if (auto NextNext = NextToken->getNextNonComment();
@@ -5474,7 +5481,8 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line,
     if (Left.TokenText == "!")
       return Style.SpaceAfterLogicalNot;
     assert(Left.TokenText == "not");
-    return Right.isOneOf(tok::coloncolon, TT_UnaryOperator);
+    return Right.isOneOf(tok::coloncolon, TT_UnaryOperator) ||
+           (Right.is(tok::l_paren) && Style.SpaceBeforeParensOptions.AfterNot);
   }
 
   // If the next token is a binary operator or a selector name, we have
@@ -6266,7 +6274,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line,
   }
 
   if (Right.is(tok::colon) &&
-      !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon)) {
+      !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon,
+                     TT_BitFieldColon)) {
     return false;
   }
   if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr)) {
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 08cf0ae6b2c2..08f3b7a7fcc4 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -1542,6 +1542,17 @@ void CompilerInvocation::setDefaultPointerAuthOptions(
           Discrimination::Constant, InitFiniPointerConstantDiscriminator);
     }
 
+    Opts.BlockInvocationFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    Opts.BlockByrefHelperFunctionPointers =
+        PointerAuthSchema(Key::ASIA, true, Discrimination::None);
+    if (LangOpts.PointerAuthBlockDescriptorPointers)
+      Opts.BlockDescriptorPointers =
+          PointerAuthSchema(Key::ASDA, true, Discrimination::Constant,
+                            BlockDescriptorConstantDiscriminator);
+
     Opts.ObjCMethodListFunctionPointers =
         PointerAuthSchema(Key::ASIA, true, Discrimination::None);
     Opts.ObjCMethodListPointer =
@@ -3598,6 +3609,8 @@ static void GeneratePointerAuthArgs(const LangOptions &Opts,
     GenerateArg(Consumer, OPT_fptrauth_objc_interface_sel);
   if (Opts.PointerAuthObjcClassROPointers)
     GenerateArg(Consumer, OPT_fptrauth_objc_class_ro);
+  if (Opts.PointerAuthBlockDescriptorPointers)
+    GenerateArg(Consumer, OPT_fptrauth_block_descriptor_pointers);
 }
 
 static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
@@ -3621,7 +3634,8 @@ static void ParsePointerAuthArgs(LangOptions &Opts, ArgList &Args,
   Opts.PointerAuthELFGOT = Args.hasArg(OPT_fptrauth_elf_got);
   Opts.AArch64JumpTableHardening =
       Args.hasArg(OPT_faarch64_jump_table_hardening);
-
+  Opts.PointerAuthBlockDescriptorPointers =
+      Args.hasArg(OPT_fptrauth_block_descriptor_pointers);
   Opts.PointerAuthObjcIsa = Args.hasArg(OPT_fptrauth_objc_isa);
   Opts.PointerAuthObjcClassROPointers = Args.hasArg(OPT_fptrauth_objc_class_ro);
   Opts.PointerAuthObjcInterfaceSel =
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 34fb825e9d42..cce8392950b0 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -1535,6 +1535,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
 #undef TARGET_OS
   }
 
+  if (LangOpts.PointerAuthIntrinsics)
+    Builder.defineMacro("__PTRAUTH__");
+
   // Get other target #defines.
   TI.getTargetDefines(LangOpts, Builder);
 }
diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h
index 7e614f7740bf..9d96e36c74ca 100644
--- a/clang/lib/Headers/avx10_2_512niintrin.h
+++ b/clang/lib/Headers/avx10_2_512niintrin.h
@@ -197,7 +197,7 @@ _mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsud_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwsud_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -218,7 +218,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwsuds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -239,7 +239,7 @@ _mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwusd_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -260,7 +260,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwusds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -281,7 +281,7 @@ _mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwuud_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
@@ -302,7 +302,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32(
 }
 
 static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuuds_epi32(
-    __m512i __A, __mmask16 __U, __m512i __B, __m512i __C) {
+    __mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
   return (__m512i)__builtin_ia32_selectd_512(
       (__mmask16)__U, (__v16si)_mm512_dpwuuds_epi32(__A, __B, __C),
       (__v16si)_mm512_setzero_si512());
diff --git a/clang/lib/Headers/avx10_2niintrin.h b/clang/lib/Headers/avx10_2niintrin.h
index 992be18f7720..d5a66cfef536 100644
--- a/clang/lib/Headers/avx10_2niintrin.h
+++ b/clang/lib/Headers/avx10_2niintrin.h
@@ -253,7 +253,7 @@ _mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwsud_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -266,7 +266,7 @@ _mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwsud_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -279,7 +279,7 @@ _mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwsuds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -292,7 +292,7 @@ _mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwsuds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwsuds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -305,7 +305,7 @@ _mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwusd_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -318,7 +318,7 @@ _mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwusd_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -331,7 +331,7 @@ _mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwusds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -344,7 +344,7 @@ _mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwusds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwusds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -357,7 +357,7 @@ _mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwuud_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -370,7 +370,7 @@ _mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwuud_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
@@ -383,7 +383,7 @@ _mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
 }
 
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) {
+_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
   return (__m128i)__builtin_ia32_selectd_128(
       (__mmask8)__U, (__v4si)_mm_dpwuuds_epi32(__A, __B, __C),
       (__v4si)_mm_setzero_si128());
@@ -396,7 +396,7 @@ _mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
 }
 
 static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwuuds_epi32(
-    __m256i __A, __mmask8 __U, __m256i __B, __m256i __C) {
+    __mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
   return (__m256i)__builtin_ia32_selectd_256(
       (__mmask8)__U, (__v8si)_mm256_dpwuuds_epi32(__A, __B, __C),
       (__v8si)_mm256_setzero_si256());
diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index 2b7f5043e09e..6206a347852b 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -697,7 +697,16 @@ template <typename _Tp> struct __remove_address_space<__constant _Tp> {
 #if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2)
 // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf
 
-int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2)));
+#ifdef __OPENCL_CPP_VERSION__
+#define CLINKAGE extern "C"
+#else
+#define CLINKAGE
+#endif
+
+CLINKAGE int printf(__constant const char *st, ...)
+    __attribute__((format(printf, 1, 2)));
+
+#undef CLINKAGE
 #endif
 
 #ifdef cl_intel_device_side_avc_motion_estimation
diff --git a/clang/lib/Headers/ptrauth.h b/clang/lib/Headers/ptrauth.h
index 7f7d387cbdfd..f902ca1e3bbd 100644
--- a/clang/lib/Headers/ptrauth.h
+++ b/clang/lib/Headers/ptrauth.h
@@ -95,7 +95,7 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
    __ptrauth qualifier; the compiler will perform this check
    automatically. */
 
-#if __has_feature(ptrauth_intrinsics)
+#if __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__)
 
 /* Strip the signature from a value without authenticating it.
 
@@ -388,6 +388,6 @@ typedef __UINTPTR_TYPE__ ptrauth_generic_signature_t;
 #define __ptrauth_objc_isa_uintptr
 #define __ptrauth_objc_super_pointer
 
-#endif /* __has_feature(ptrauth_intrinsics) */
+#endif /* __has_feature(ptrauth_intrinsics) || defined(__PTRAUTH__) */
 
 #endif /* __PTRAUTH_H */
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 1f695b4a8676..b282a600c0e5 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -174,8 +174,6 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
   ExtendedTokenMode = 0;
 
   NewLinePtr = nullptr;
-
-  IsFirstPPToken = true;
 }
 
 /// Lexer constructor - Create a new lexer object for the specified buffer
@@ -3225,7 +3223,6 @@ std::optional<Token> Lexer::peekNextPPToken() {
   bool atStartOfLine = IsAtStartOfLine;
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   bool leadingSpace = HasLeadingSpace;
-  bool isFirstPPToken = IsFirstPPToken;
 
   Token Tok;
   Lex(Tok);
@@ -3236,7 +3233,6 @@ std::optional<Token> Lexer::peekNextPPToken() {
   HasLeadingSpace = leadingSpace;
   IsAtStartOfLine = atStartOfLine;
   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
-  IsFirstPPToken = isFirstPPToken;
   // Restore the lexer back to non-skipping mode.
   LexingRawMode = false;
 
@@ -3726,11 +3722,6 @@ bool Lexer::Lex(Token &Result) {
     HasLeadingEmptyMacro = false;
   }
 
-  if (IsFirstPPToken) {
-    Result.setFlag(Token::FirstPPToken);
-    IsFirstPPToken = false;
-  }
-
   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
   IsAtPhysicalStartOfLine = false;
   bool isRawLex = isLexingRawMode();
diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp
index a62508e3e27b..5b08d7f0efe5 100644
--- a/clang/lib/Lex/LiteralSupport.cpp
+++ b/clang/lib/Lex/LiteralSupport.cpp
@@ -1467,7 +1467,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
   if (s != PossibleNewDigitStart)
     DigitsBegin = PossibleNewDigitStart;
   else
-    IsSingleZero = (s == ThisTokEnd); // Is the only thing we've seen a 0?
+    IsSingleZero = (s == ThisTokBegin + 1);
 
   if (s == ThisTokEnd)
     return; // Done, simple octal number like 01234
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index bcd3ea60ce3d..2120e45dd8f8 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -43,6 +43,7 @@
 #include "clang/Lex/MacroArgs.h"
 #include "clang/Lex/MacroInfo.h"
 #include "clang/Lex/ModuleLoader.h"
+#include "clang/Lex/NoTrivialPPDirectiveTracer.h"
 #include "clang/Lex/Pragma.h"
 #include "clang/Lex/PreprocessingRecord.h"
 #include "clang/Lex/PreprocessorLexer.h"
@@ -247,8 +248,6 @@ void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const {
     llvm::errs() << " [LeadingSpace]";
   if (Tok.isExpandDisabled())
     llvm::errs() << " [ExpandDisabled]";
-  if (Tok.isFirstPPToken())
-    llvm::errs() << " [First pp-token]";
   if (Tok.needsCleaning()) {
     const char *Start = SourceMgr.getCharacterData(Tok.getLocation());
     llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength())
@@ -577,8 +576,11 @@ void Preprocessor::EnterMainSourceFile() {
     // export module M; // error: module declaration must occur
     //                  //        at the start of the translation unit.
     if (getLangOpts().CPlusPlusModules) {
+      auto Tracer = std::make_unique<NoTrivialPPDirectiveTracer>(*this);
+      DirTracer = Tracer.get();
+      addPPCallbacks(std::move(Tracer));
       std::optional<Token> FirstPPTok = CurLexer->peekNextPPToken();
-      if (FirstPPTok && FirstPPTok->isFirstPPToken())
+      if (FirstPPTok)
         FirstPPTokenLoc = FirstPPTok->getLocation();
     }
   }
@@ -940,6 +942,8 @@ void Preprocessor::Lex(Token &Result) {
       StdCXXImportSeqState.handleHeaderName();
       break;
     case tok::kw_export:
+      if (hasSeenNoTrivialPPDirective())
+        Result.setFlag(Token::HasSeenNoTrivialPPDirective);
       TrackGMFState.handleExport();
       StdCXXImportSeqState.handleExport();
       ModuleDeclState.handleExport();
@@ -966,6 +970,8 @@ void Preprocessor::Lex(Token &Result) {
           }
           break;
         } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) {
+          if (hasSeenNoTrivialPPDirective())
+            Result.setFlag(Token::HasSeenNoTrivialPPDirective);
           TrackGMFState.handleModule(StdCXXImportSeqState.afterTopLevelSeq());
           ModuleDeclState.handleModule();
           break;
@@ -1680,3 +1686,31 @@ const char *Preprocessor::getCheckPoint(FileID FID, const char *Start) const {
 
   return nullptr;
 }
+
+bool Preprocessor::hasSeenNoTrivialPPDirective() const {
+  return DirTracer && DirTracer->hasSeenNoTrivialPPDirective();
+}
+
+bool NoTrivialPPDirectiveTracer::hasSeenNoTrivialPPDirective() const {
+  return SeenNoTrivialPPDirective;
+}
+
+void NoTrivialPPDirectiveTracer::setSeenNoTrivialPPDirective() {
+  if (InMainFile && !SeenNoTrivialPPDirective)
+    SeenNoTrivialPPDirective = true;
+}
+
+void NoTrivialPPDirectiveTracer::LexedFileChanged(
+    FileID FID, LexedFileChangeReason Reason,
+    SrcMgr::CharacteristicKind FileType, FileID PrevFID, SourceLocation Loc) {
+  InMainFile = (FID == PP.getSourceManager().getMainFileID());
+}
+
+void NoTrivialPPDirectiveTracer::MacroExpands(const Token &MacroNameTok,
+                                              const MacroDefinition &MD,
+                                              SourceRange Range,
+                                              const MacroArgs *Args) {
+  // FIXME: Does only enable builtin macro expansion make sense?
+  if (!MD.getMacroInfo()->isBuiltinMacro())
+    setSeenNoTrivialPPDirective();
+}
diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp
index bc238a9517a3..3515343202de 100644
--- a/clang/lib/Parse/ParseExpr.cpp
+++ b/clang/lib/Parse/ParseExpr.cpp
@@ -3342,7 +3342,8 @@ ExprResult Parser::ParseBlockLiteralExpression() {
     Actions.ActOnBlockError(CaretLoc, getCurScope());
     return ExprError();
   }
-
+  EnterExpressionEvaluationContextForFunction PotentiallyEvaluated(
+       Actions, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
   StmtResult Stmt(ParseCompoundStatementBody());
   BlockScope.Exit();
   if (!Stmt.isInvalid())
diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp
index 8834bf80c401..447d0283431b 100644
--- a/clang/lib/Parse/Parser.cpp
+++ b/clang/lib/Parse/Parser.cpp
@@ -2361,9 +2361,10 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   // Parse a global-module-fragment, if present.
   if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) {
     SourceLocation SemiLoc = ConsumeToken();
-    if (!Introducer.isFirstPPToken()) {
+    if (ImportState != Sema::ModuleImportState::FirstDecl ||
+        Introducer.hasSeenNoTrivialPPDirective()) {
       Diag(StartLoc, diag::err_global_module_introducer_not_at_start)
-        << SourceRange(StartLoc, SemiLoc);
+          << SourceRange(StartLoc, SemiLoc);
       return nullptr;
     }
     if (MDK == Sema::ModuleDeclKind::Interface) {
@@ -2418,7 +2419,8 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) {
   ExpectAndConsumeSemi(diag::err_module_expected_semi);
 
   return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, Partition,
-                                 ImportState, Introducer.isFirstPPToken());
+                                 ImportState,
+                                 Introducer.hasSeenNoTrivialPPDirective());
 }
 
 Decl *Parser::ParseModuleImport(SourceLocation AtLoc,
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 5e75c64eb2b9..ec39bca6039f 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -503,8 +503,12 @@ static bool areAllValuesNoReturn(const VarDecl *VD, const CFGBlock &VarBlk,
 
   TransferFunctions TF(VD);
   BackwardDataflowWorklist Worklist(*AC.getCFG(), AC);
+  llvm::DenseSet<const CFGBlock *> Visited;
   Worklist.enqueueBlock(&VarBlk);
   while (const CFGBlock *B = Worklist.dequeue()) {
+    if (Visited.contains(B))
+      continue;
+    Visited.insert(B);
     // First check the current block.
     for (CFGBlock::const_reverse_iterator ri = B->rbegin(), re = B->rend();
          ri != re; ++ri) {
@@ -2887,8 +2891,7 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
       .setAlwaysAdd(Stmt::UnaryOperatorClass);
   }
 
-  bool EnableLifetimeSafetyAnalysis = !Diags.isIgnored(
-      diag::warn_experimental_lifetime_safety_dummy_warning, D->getBeginLoc());
+  bool EnableLifetimeSafetyAnalysis = S.getLangOpts().EnableLifetimeSafety;
   // Install the logical handler.
   std::optional<LogicalErrorHandler> LEH;
   if (LogicalErrorHandler::hasActiveDiagnostics(Diags, D->getBeginLoc())) {
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index 834417f8e15a..20567a6d9d1a 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1097,10 +1097,6 @@ static bool CheckFunctionConstraintsWithoutInstantiation(
   }
 
   Sema::ContextRAII SavedContext(SemaRef, FD);
-  std::optional<Sema::CXXThisScopeRAII> ThisScope;
-  if (auto *Method = dyn_cast<CXXMethodDecl>(FD))
-    ThisScope.emplace(SemaRef, /*Record=*/Method->getParent(),
-                      /*ThisQuals=*/Method->getMethodQualifiers());
   return SemaRef.CheckConstraintSatisfaction(
       Template, TemplateAC, MLTAL, PointOfInstantiation, Satisfaction);
 }
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 14403e65e8f4..d43dc2297c6f 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -3267,6 +3267,14 @@ void Sema::mergeDeclAttributes(NamedDecl *New, Decl *Old,
     if (isa<UsedAttr>(I) || isa<RetainAttr>(I))
       continue;
 
+    if (isa<InferredNoReturnAttr>(I)) {
+      if (auto *FD = dyn_cast<FunctionDecl>(New)) {
+        if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+          continue; // Don't propagate inferred noreturn attributes to explicit
+                    // specializations.
+      }
+    }
+
     if (mergeDeclAttribute(*this, New, I, LocalAMK))
       foundAny = true;
   }
@@ -19643,6 +19651,7 @@ void Sema::ActOnFields(Scope *S, SourceLocation RecLoc, Decl *EnclosingDecl,
                  Q && Q.isAddressDiscriminated()) {
         Record->setArgPassingRestrictions(
             RecordArgPassingKind::CanNeverPassInRegs);
+        Record->setNonTrivialToPrimitiveCopy(true);
       }
     }
 
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 5f481ed1f713..a7897bdfe6e0 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1970,6 +1970,13 @@ void clang::inferNoReturnAttr(Sema &S, const Decl *D) {
   if (!FD)
     return;
 
+  // Skip explicit specializations here as they may have
+  // a user-provided definition that may deliberately differ from the primary
+  // template. If an explicit specialization truly never returns, the user
+  // should explicitly mark it with [[noreturn]].
+  if (FD->getTemplateSpecializationKind() == TSK_ExplicitSpecialization)
+    return;
+
   auto *NonConstFD = const_cast<FunctionDecl *>(FD);
   DiagnosticsEngine &Diags = S.getDiagnostics();
   if (Diags.isIgnored(diag::warn_falloff_nonvoid, FD->getLocation()) &&
@@ -5011,10 +5018,10 @@ void Sema::AddModeAttr(Decl *D, const AttributeCommonInfo &CI,
 
 static void handleNonStringAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   // This only applies to fields and variable declarations which have an array
-  // type.
+  // type or pointer type, with character elements.
   QualType QT = cast<ValueDecl>(D)->getType();
-  if (!QT->isArrayType() ||
-      !QT->getBaseElementTypeUnsafe()->isAnyCharacterType()) {
+  if ((!QT->isArrayType() && !QT->isPointerType()) ||
+      !QT->getPointeeOrArrayElementType()->isAnyCharacterType()) {
     S.Diag(D->getBeginLoc(), diag::warn_attribute_non_character_array)
         << AL << AL.isRegularKeywordAttribute() << QT << AL.getRange();
     return;
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 728ada33e2e6..5421e9562c8b 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9309,14 +9309,14 @@ AssignConvertType Sema::CheckAssignmentConstraints(QualType LHSType,
   // If we have an atomic type, try a non-atomic assignment, then just add an
   // atomic qualification step.
   if (const AtomicType *AtomicTy = dyn_cast<AtomicType>(LHSType)) {
-    AssignConvertType result =
+    AssignConvertType Result =
         CheckAssignmentConstraints(AtomicTy->getValueType(), RHS, Kind);
-    if (result != AssignConvertType::Compatible)
-      return result;
+    if (!IsAssignConvertCompatible(Result))
+      return Result;
     if (Kind != CK_NoOp && ConvertRHS)
       RHS = ImpCastExprToType(RHS.get(), AtomicTy->getValueType(), Kind);
     Kind = CK_NonAtomicToAtomic;
-    return AssignConvertType::Compatible;
+    return Result;
   }
 
   // If the left-hand side is a reference type, then we are in a
diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp
index 7c982bcd63d7..10a390286ec7 100644
--- a/clang/lib/Sema/SemaModule.cpp
+++ b/clang/lib/Sema/SemaModule.cpp
@@ -264,10 +264,11 @@ Sema::DeclGroupPtrTy
 Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
                       ModuleDeclKind MDK, ModuleIdPath Path,
                       ModuleIdPath Partition, ModuleImportState &ImportState,
-                      bool IntroducerIsFirstPPToken) {
+                      bool SeenNoTrivialPPDirective) {
   assert(getLangOpts().CPlusPlusModules &&
          "should only have module decl in standard C++ modules");
 
+  bool IsFirstDecl = ImportState == ModuleImportState::FirstDecl;
   bool SeenGMF = ImportState == ModuleImportState::GlobalFragment;
   // If any of the steps here fail, we count that as invalidating C++20
   // module state;
@@ -335,7 +336,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc,
 
   // In C++20, A module directive may only appear as the first preprocessing
   // tokens in a file (excluding the global module fragment.).
-  if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) {
+  if (getLangOpts().CPlusPlusModules &&
+      (!IsFirstDecl || SeenNoTrivialPPDirective) && !SeenGMF) {
     Diag(ModuleLoc, diag::err_module_decl_not_at_start);
     SourceLocation BeginLoc = PP.getMainFileFirstPPTokenLoc();
     Diag(BeginLoc, diag::note_global_module_introducer_missing)
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index f3baf0c3ef3b..1b54628c5e56 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -245,7 +245,6 @@ void StandardConversionSequence::setAsIdentityConversion() {
   IsLvalueReference = true;
   BindsToFunctionLvalue = false;
   BindsToRvalue = false;
-  IsImplicitObjectArgumentQualificationConversion = false;
   BindsImplicitObjectArgumentWithoutRefQualifier = false;
   ObjCLifetimeConversionBinding = false;
   FromBracedInitList = false;
@@ -5318,7 +5317,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.Standard.DirectBinding = BindsDirectly;
     ICS.Standard.IsLvalueReference = !isRValRef;
     ICS.Standard.BindsToFunctionLvalue = T2->isFunctionType();
-    ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false;
     ICS.Standard.BindsToRvalue = InitCategory.isRValue();
     ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.Standard.ObjCLifetimeConversionBinding =
@@ -5498,7 +5496,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.Standard.IsLvalueReference = !isRValRef;
     ICS.Standard.BindsToFunctionLvalue = false;
     ICS.Standard.BindsToRvalue = true;
-    ICS.Standard.IsImplicitObjectArgumentQualificationConversion = false;
     ICS.Standard.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.Standard.ObjCLifetimeConversionBinding = false;
   } else if (ICS.isUserDefined()) {
@@ -5521,8 +5518,6 @@ TryReferenceInit(Sema &S, Expr *Init, QualType DeclType,
     ICS.UserDefined.After.IsLvalueReference = !isRValRef;
     ICS.UserDefined.After.BindsToFunctionLvalue = false;
     ICS.UserDefined.After.BindsToRvalue = !LValRefType;
-    ICS.UserDefined.After.IsImplicitObjectArgumentQualificationConversion =
-        false;
     ICS.UserDefined.After.BindsImplicitObjectArgumentWithoutRefQualifier = false;
     ICS.UserDefined.After.ObjCLifetimeConversionBinding = false;
     ICS.UserDefined.After.FromBracedInitList = false;
@@ -5807,7 +5802,6 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType,
       StandardConversionSequence &SCS = Result.isStandard() ? Result.Standard :
                                             Result.UserDefined.After;
       SCS.ReferenceBinding = true;
-      SCS.IsImplicitObjectArgumentQualificationConversion = false;
       SCS.IsLvalueReference = ToType->isLValueReferenceType();
       SCS.BindsToRvalue = true;
       SCS.BindsToFunctionLvalue = false;
@@ -6005,12 +5999,8 @@ static ImplicitConversionSequence TryObjectArgumentInitialization(
   // affects the conversion rank.
   QualType ClassTypeCanon = S.Context.getCanonicalType(ClassType);
   ImplicitConversionKind SecondKind;
-  bool IsQualificationConversion = false;
-  if (ImplicitParamType.getCanonicalType() == FromTypeCanon) {
+  if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) {
     SecondKind = ICK_Identity;
-  } else if (ClassTypeCanon == FromTypeCanon.getLocalUnqualifiedType()) {
-    SecondKind = ICK_Identity;
-    IsQualificationConversion = true;
   } else if (S.IsDerivedFrom(Loc, FromType, ClassType)) {
     SecondKind = ICK_Derived_To_Base;
   } else if (!Method->isExplicitObjectMemberFunction()) {
@@ -6051,8 +6041,6 @@ static ImplicitConversionSequence TryObjectArgumentInitialization(
   ICS.Standard.setFromType(FromType);
   ICS.Standard.setAllToTypes(ImplicitParamType);
   ICS.Standard.ReferenceBinding = true;
-  ICS.Standard.IsImplicitObjectArgumentQualificationConversion =
-      IsQualificationConversion;
   ICS.Standard.DirectBinding = true;
   ICS.Standard.IsLvalueReference = Method->getRefQualifier() != RQ_RValue;
   ICS.Standard.BindsToFunctionLvalue = false;
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index f85826aecadf..f46be75bda20 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -2287,7 +2287,11 @@ StmtResult Sema::ActOnForStmt(SourceLocation ForLoc, SourceLocation LParenLoc,
           // we can diagnose if we don't see any variable declarations. This
           // covers a case like declaring a typedef, function, or structure
           // type rather than a variable.
-          NonVarSeen = DI;
+          //
+          // Note, _Static_assert is acceptable because it does not declare an
+          // identifier at all, so "for object having" does not apply.
+          if (!isa<StaticAssertDecl>(DI))
+            NonVarSeen = DI;
         }
       }
       // Diagnose if we saw a non-variable declaration but no variable
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index b76619fc5026..f067aedf1cc9 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -4749,8 +4749,6 @@ Sema::CheckConceptTemplateId(const CXXScopeSpec &SS,
   EnterExpressionEvaluationContext EECtx{
       *this, ExpressionEvaluationContext::Unevaluated, CSD};
 
-  ContextRAII CurContext(*this, CSD->getDeclContext(),
-                         /*NewThisContext=*/false);
   if (!AreArgsDependent &&
       CheckConstraintSatisfaction(
           NamedConcept, AssociatedConstraint(NamedConcept->getConstraintExpr()),
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index d09a72b71b80..ce78ecc2d4a2 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -5525,6 +5525,15 @@ static TemplateDeductionResult CheckDeductionConsistency(
   // FIXME: A substitution can be incomplete on a non-structural part of the
   // type. Use the canonical type for now, until the TemplateInstantiator can
   // deal with that.
+
+  // Workaround: Implicit deduction guides use InjectedClassNameTypes, whereas
+  // the explicit guides don't. The substitution doesn't transform these types,
+  // so let it transform their specializations instead.
+  bool IsDeductionGuide = isa<CXXDeductionGuideDecl>(FTD->getTemplatedDecl());
+  if (IsDeductionGuide) {
+    if (auto *Injected = P->getAs<InjectedClassNameType>())
+      P = Injected->getInjectedSpecializationType();
+  }
   QualType InstP = S.SubstType(P.getCanonicalType(), MLTAL, FTD->getLocation(),
                                FTD->getDeclName(), &IsIncompleteSubstitution);
   if (InstP.isNull() && !IsIncompleteSubstitution)
@@ -5539,9 +5548,15 @@ static TemplateDeductionResult CheckDeductionConsistency(
   if (auto *PA = dyn_cast<PackExpansionType>(A);
       PA && !isa<PackExpansionType>(InstP))
     A = PA->getPattern();
-  if (!S.Context.hasSameType(
-          S.Context.getUnqualifiedArrayType(InstP.getNonReferenceType()),
-          S.Context.getUnqualifiedArrayType(A.getNonReferenceType())))
+  auto T1 = S.Context.getUnqualifiedArrayType(InstP.getNonReferenceType());
+  auto T2 = S.Context.getUnqualifiedArrayType(A.getNonReferenceType());
+  if (IsDeductionGuide) {
+    if (auto *Injected = T1->getAs<InjectedClassNameType>())
+      T1 = Injected->getInjectedSpecializationType();
+    if (auto *Injected = T2->getAs<InjectedClassNameType>())
+      T2 = Injected->getInjectedSpecializationType();
+  }
+  if (!S.Context.hasSameType(T1, T2))
     return TemplateDeductionResult::NonDeducedMismatch;
   return TemplateDeductionResult::Success;
 }
diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
index e2c3cdcd536b..d2b87f2702a9 100644
--- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
+++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
@@ -5685,7 +5685,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation,
   };
   Function->setDeclarationNameLoc(NameLocPointsToPattern());
 
-  EnterExpressionEvaluationContext EvalContext(
+  EnterExpressionEvaluationContextForFunction EvalContext(
       *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
 
   Qualifiers ThisTypeQuals;
diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp
index 1d8687e4bf1c..21ba19ee551c 100644
--- a/clang/lib/Sema/SemaTypeTraits.cpp
+++ b/clang/lib/Sema/SemaTypeTraits.cpp
@@ -1768,7 +1768,10 @@ static bool EvaluateBinaryTypeTrait(Sema &Self, TypeTrait BTT,
       // Objective-C lifetime, this is a non-trivial assignment.
       if (LhsT.getNonReferenceType().hasNonTrivialObjCLifetime())
         return false;
-
+      ASTContext &Context = Self.getASTContext();
+      if (Context.containsAddressDiscriminatedPointerAuth(LhsT) ||
+          Context.containsAddressDiscriminatedPointerAuth(RhsT))
+        return false;
       return !Result.get()->hasNonTrivialCall(Self.Context);
     }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
index 1e3adb4f266c..789c7772d123 100644
--- a/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/AssumeModeling.cpp
@@ -45,7 +45,6 @@ void AssumeModelingChecker::checkPostStmt(const AttributedStmt *A,
       continue;
 
     const auto *Assumption = AssumptionVal.getAsInteger();
-    assert(Assumption && "We should know the exact outcome of an assume expr");
     if (Assumption && Assumption->isZero()) {
       C.addSink();
     }
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
index fa8e669b6bb2..ab29f86cec32 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineC.cpp
@@ -916,7 +916,8 @@ VisitUnaryExprOrTypeTraitExpr(const UnaryExprOrTypeTraitExpr *Ex,
   QualType T = Ex->getTypeOfArgument();
 
   for (ExplodedNode *N : CheckedSet) {
-    if (Ex->getKind() == UETT_SizeOf) {
+    if (Ex->getKind() == UETT_SizeOf || Ex->getKind() == UETT_DataSizeOf ||
+        Ex->getKind() == UETT_CountOf) {
       if (!T->isIncompleteType() && !T->isConstantSizeType()) {
         assert(T->isVariableArrayType() && "Unknown non-constant-sized type.");
 
diff --git a/clang/test/ASTMerge/enum/Inputs/enum3.c b/clang/test/ASTMerge/enum/Inputs/enum3.c
new file mode 100644
index 000000000000..32ad5366434a
--- /dev/null
+++ b/clang/test/ASTMerge/enum/Inputs/enum3.c
@@ -0,0 +1,14 @@
+// [C23] missing underlying types
+enum E1 : int {
+  E1Enumerator1
+};
+
+enum E2 {
+  E2Enumerator1
+};
+
+// [C23] Incompatible underlying types
+enum E3 : long {
+  E3Enumerator1
+};
+
diff --git a/clang/test/ASTMerge/enum/Inputs/enum4.c b/clang/test/ASTMerge/enum/Inputs/enum4.c
new file mode 100644
index 000000000000..15f5c603c7ab
--- /dev/null
+++ b/clang/test/ASTMerge/enum/Inputs/enum4.c
@@ -0,0 +1,14 @@
+// [C23] missing underlying types
+enum E1 {
+  E1Enumerator1
+};
+
+enum E2 : int {
+  E2Enumerator1
+};
+
+// [C23] Incompatible underlying types
+enum E3 : short {
+  E3Enumerator1
+};
+
diff --git a/clang/test/ASTMerge/enum/test2.c b/clang/test/ASTMerge/enum/test2.c
new file mode 100644
index 000000000000..bdd8b13ee4c2
--- /dev/null
+++ b/clang/test/ASTMerge/enum/test2.c
@@ -0,0 +1,14 @@
+// RUN: %clang_cc1 -std=c23 -emit-pch -o %t.1.ast %S/Inputs/enum3.c
+// RUN: %clang_cc1 -std=c23 -emit-pch -o %t.2.ast %S/Inputs/enum4.c
+// RUN: %clang_cc1 -std=c23 -ast-merge %t.1.ast -ast-merge %t.2.ast -fsyntax-only %s 2>&1 | FileCheck %s
+
+// CHECK: enum3.c:2:6: warning: type 'enum E1' has incompatible definitions in different translation units
+// CHECK: enum4.c:2:6: note: enumeration 'E1' missing fixed underlying type here
+// CHECK: enum3.c:2:6: note: enumeration 'E1' has fixed underlying type here
+// CHECK: enum3.c:6:6: warning: type 'enum E2' has incompatible definitions in different translation units
+// CHECK: enum4.c:6:6: note: enumeration 'E2' has fixed underlying type here
+// CHECK: enum3.c:6:6: note: enumeration 'E2' missing fixed underlying type here
+// CHECK: enum3.c:11:6: warning: type 'enum E3' has incompatible definitions in different translation units
+// CHECK: enum3.c:11:6: note: enumeration 'E3' declared with incompatible fixed underlying types ('long' vs. 'short')
+// CHECK: 3 warnings generated
+
diff --git a/clang/test/Analysis/cxx23-assume-attribute.cpp b/clang/test/Analysis/cxx23-assume-attribute.cpp
index 86e7662cd2af..4cc16446572d 100644
--- a/clang/test/Analysis/cxx23-assume-attribute.cpp
+++ b/clang/test/Analysis/cxx23-assume-attribute.cpp
@@ -69,3 +69,9 @@ int assume_and_fallthrough_at_the_same_attrstmt(int a, int b) {
 
   return 0;
 }
+
+void assume_opaque_gh151854_no_crash() {
+  extern bool opaque();
+  [[assume(opaque())]]; // no-crash
+  // expected-warning@-1 {{assumption is ignored because it contains (potential) side-effects}}
+}
diff --git a/clang/test/Analysis/engine/gh151711.cpp b/clang/test/Analysis/engine/gh151711.cpp
new file mode 100644
index 000000000000..a9950a7a3b9d
--- /dev/null
+++ b/clang/test/Analysis/engine/gh151711.cpp
@@ -0,0 +1,18 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify -x c %s
+
+void clang_analyzer_dump(int);
+
+// Ensure that VLA types are correctly handled by unary type traits in the
+// expression engine. Previously, __datasizeof and _Countof both caused failed
+// assertions.
+void gh151711(int i) {
+  clang_analyzer_dump(sizeof(int[i++]));       // expected-warning {{Unknown}}
+#ifdef __cplusplus
+  // __datasizeof is only available in C++.
+  clang_analyzer_dump(__datasizeof(int[i++])); // expected-warning {{Unknown}}
+#else
+  // _Countof is only available in C.
+  clang_analyzer_dump(_Countof(int[i++]));     // expected-warning {{Unknown}}
+#endif
+}
diff --git a/clang/test/C/C23/n3037.c b/clang/test/C/C23/n3037.c
index ce6f4c4ea7ac..d5699e2df142 100644
--- a/clang/test/C/C23/n3037.c
+++ b/clang/test/C/C23/n3037.c
@@ -401,3 +401,77 @@ _Static_assert(0 == _Generic(inner_anon_tagged.untagged, struct { int i; } : 1,
 // unions and structures are both RecordDecl objects, whereas EnumDecl is not).
 enum { E_Untagged1 } nontag_enum; // both-note {{previous definition is here}}
 _Static_assert(0 == _Generic(nontag_enum, enum { E_Untagged1 } : 1, default : 0)); // both-error {{redefinition of enumerator 'E_Untagged1'}}
+
+// Test that enumerations are compatible with their underlying type, but still
+// diagnose when "same type" is required rather than merely "compatible type".
+enum E1 : int { e1 }; // Fixed underlying type
+enum E2 { e2 };       // Unfixed underlying type, defaults to int or unsigned int
+
+struct GH149965_1 { int h; };
+// This typeof trick is used to get the underlying type of the enumeration in a
+// platform agnostic way.
+struct GH149965_2 { __typeof__(+(enum E2){}) h; };
+void gh149965(void) {
+  extern struct GH149965_1 x1; // c17-note {{previous declaration is here}}
+  extern struct GH149965_2 x2; // c17-note {{previous declaration is here}}
+
+  // Both the structure and the variable declarations are fine because only a
+  // compatible type is required, not the same type, because the structures are
+  // declared in different scopes.
+  struct GH149965_1 { enum E1 h; };
+  struct GH149965_2 { enum E2 h; };
+
+  extern struct GH149965_1 x1; // c17-error {{redeclaration of 'x1'}}
+  extern struct GH149965_2 x2; // c17-error {{redeclaration of 'x2'}}
+
+  // However, in the same scope, the same type is required, not just compatible
+  // types.
+  // FIXME: this should be an error in both C17 and C23 mode.
+  struct GH149965_3 { int h; };     // c17-note {{previous definition is here}}
+  struct GH149965_3 { enum E1 h; }; // c17-error {{redefinition of 'GH149965_3'}}
+
+  // For Clang, the composite type after declaration merging is the enumeration
+  // type rather than an integer type.
+  enum E1 *eptr;
+  [[maybe_unused]] __typeof__(x1.h) *ptr = eptr;
+  enum E2 *eptr2;
+  [[maybe_unused]] __typeof__(x2.h) *ptr2 = eptr2;
+}
+
+// Test that enumerations with mixed underlying types are properly handled.
+enum GH150594_E1 : int { GH150594_Val1 };
+enum GH150594_E2 : int { GH150594_Val2 };
+enum GH150594_E3 { GH150594_Val3 };
+enum GH150594_E4 : int { GH150594_Val4 };
+void GH150594(void) {
+  extern enum GH150594_E1 Fn1(void); // both-note {{previous declaration is here}}
+  extern enum GH150594_E2 Fn2(void); // c17-note {{previous declaration is here}}
+  extern enum GH150594_E3 Fn3(void); // both-note {{previous declaration is here}}
+  extern enum GH150594_E4 Fn4(void); // both-note {{previous declaration is here}}
+  enum GH150594_E1 { GH150594_Val1 };
+  enum GH150594_E2 : int { GH150594_Val2 };
+  enum GH150594_E3 : int { GH150594_Val3 };
+  enum GH150594_E4 : short { GH150594_Val4 };
+  extern enum GH150594_E1 Fn1(void); // both-error {{conflicting types for 'Fn1'}}
+  extern enum GH150594_E2 Fn2(void); // c17-error {{conflicting types for 'Fn2'}}
+  extern enum GH150594_E3 Fn3(void); // both-error {{conflicting types for 'Fn3'}}
+  extern enum GH150594_E4 Fn4(void); // both-error {{conflicting types for 'Fn4'}}
+
+  // Show that two declarations in the same scope give expected diagnostics.
+  enum E1 { e1 };       // both-note {{previous declaration is here}}
+  enum E1 : int { e1 }; // both-error {{enumeration previously declared with nonfixed underlying type}}
+
+  enum E2 : int { e2 }; // both-note {{previous declaration is here}}
+  enum E2 { e2 };       // both-error {{enumeration previously declared with fixed underlying type}}
+
+  enum E3 : int { e3 };   // both-note {{previous declaration is here}}
+  enum E3 : short { e3 }; // both-error {{enumeration redeclared with different underlying type 'short' (was 'int')}}
+
+  typedef short foo;
+  enum E4 : foo { e4 };   // c17-note 2 {{previous definition is here}}
+  enum E4 : short { e4 }; // c17-error {{redefinition of 'E4'}} \
+                             c17-error {{redefinition of enumerator 'e4'}}
+
+  enum E5 : foo { e5 }; // both-note {{previous declaration is here}}
+  enum E5 : int { e5 }; // both-error {{enumeration redeclared with different underlying type 'int' (was 'foo' (aka 'short'))}}
+}
diff --git a/clang/test/C/C2y/n3353.c b/clang/test/C/C2y/n3353.c
index cd61cbf03906..a2e08cf6344d 100644
--- a/clang/test/C/C2y/n3353.c
+++ b/clang/test/C/C2y/n3353.c
@@ -44,7 +44,12 @@ static const void *ptr = 0o0;  /* ext-warning {{octal integer literals are a C2y
 #endif
 
 // 0 by itself is not deprecated, of course.
-int k = 0;
+int k1                = 0;
+unsigned int k2       = 0u;
+long k3               = 0l;
+unsigned long k4      = 0ul;
+long long k5          = 0ll;
+unsigned long long k6 = 0ull;
 
 // Test a preprocessor use of 0 by itself, which is also not deprecated.
 #if 0
@@ -65,7 +70,6 @@ static_assert(__extension__ _Generic(typeof(l), const int : 1, default : 0)); //
 
 // Note that 0o by itself is an invalid literal.
 int m = 0o; /* expected-error {{invalid suffix 'o' on integer constant}}
-               c2y-warning {{octal literals without a '0o' prefix are deprecated}}
              */
 
 // Ensure negation works as expected.
@@ -83,13 +87,11 @@ int n = 0o18; /* expected-error {{invalid digit '8' in octal constant}}
                  cpp-warning {{octal integer literals are a Clang extension}}
                */
 int o1 = 0o8; /* expected-error {{invalid suffix 'o8' on integer constant}}
-                 c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                */
 // FIXME: however, it matches the behavior for hex literals in terms of the
 // error reported. Unfortunately, we then go on to think 0 is an octal literal
 // without a prefix, which is again a bit confusing.
 int o2 = 0xG; /* expected-error {{invalid suffix 'xG' on integer constant}}
-                 c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                */
 
 // Show that floating-point suffixes on octal literals are rejected.
@@ -130,7 +132,6 @@ constexpr int p = 0o0'1'2'3'4'5'6'7; /* compat-warning {{octal integer literals
                                       */
 static_assert(p == 01234567); // c2y-warning {{octal literals without a '0o' prefix are deprecated}}
 int q = 0o'0'1; /* expected-error {{invalid suffix 'o'0'1' on integer constant}}
-                   c2y-warning {{octal literals without a '0o' prefix are deprecated}}
                  */
 
 #define M 0o123
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index 58080014abc5..286c9d40d2da 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -226,17 +226,6 @@ add_custom_target(clang-test)
 add_dependencies(clang-test check-clang)
 set_target_properties(clang-test PROPERTIES FOLDER "Clang/Tests")
 
-# Allow running Clang Python binding tests separately from CI.
-add_lit_testsuite(check-clang-python "Running the Clang Python tests"
-  ${CMAKE_CURRENT_BINARY_DIR}
-  #LIT ${LLVM_LIT}
-  PARAMS ${CLANG_TEST_PARAMS}
-  DEPENDS ${CLANG_TEST_DEPS}
-  ARGS ${CLANG_TEST_EXTRA_ARGS} --filter=bindings.sh
-  # Avoid running tests twice.
-  EXCLUDE_FROM_CHECK_ALL
-  )
-
 # FIXME: This logic can be removed once all buildbots have moved
 # debuginfo-test from clang/test to llvm/projects or monorepo.
 if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/debuginfo-tests)
diff --git a/clang/test/CXX/module/cpp.pre/module_decl.cpp b/clang/test/CXX/module/cpp.pre/module_decl.cpp
index 6238347c167a..5c29aeff1b63 100644
--- a/clang/test/CXX/module/cpp.pre/module_decl.cpp
+++ b/clang/test/CXX/module/cpp.pre/module_decl.cpp
@@ -1,8 +1,147 @@
 // RUN: rm -rf %t
 // RUN: mkdir -p %t
-// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -verify -o %t/M.pcm
+// RUN: split-file %s %t
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/line.cpp -verify -o %t/line.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/gnu_line_marker.cpp -verify -o %t/gnu_line_marker.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/include.cpp -verify -o %t/include.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/ident.cpp -verify -o %t/ident.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_comment.cpp -verify -o %t/pragma_comment.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_mark.cpp -verify -o %t/pragma_mark.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_detect_mismatch.cpp -verify -o %t/pragma_detect_mismatch.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_debug.cpp -verify -o %t/pragma_clang_debug.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_message.cpp -verify -o %t/pragma_message.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_warn.cpp -verify -o %t/pragma_gcc_warn.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_error.cpp -verify -o %t/pragma_gcc_error.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_push_pop.cpp -verify -o %t/pragma_diag_push_pop.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_ignore.cpp -verify -o %t/pragma_diag_ignore.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_opencl_ext.cpp -verify -o %t/pragma_opencl_ext.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_push_pop.cpp -verify -o %t/pragma_push_pop.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_exec_charset.cpp -verify -o %t/pragma_exec_charset.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_assume_nonnull.cpp -verify -o %t/pragma_clang_assume_nonnull.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/marco_expand.cpp -DMACRO="" -verify -o %t/marco_expand.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/define.cpp -verify -o %t/define.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/undef.cpp -verify -o %t/undef.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/defined.cpp -verify -o %t/defined.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_embed.cpp -verify -o %t/has_embed.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_include.cpp -verify -o %t/has_include.pcm
 
+//--- header.h
+#ifndef HEADER_H
+#define HEADER_H
+
+#endif // HEADER_H
+
+//--- line.cpp
+// expected-no-diagnostics
+#line 3
+export module M;
+
+//--- gnu_line_marker.cpp
+// expected-no-diagnostics
+# 1 __FILE__ 1 3
+export module M;
+
+//--- include.cpp
+#include "header.h" // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- ident.cpp
+// expected-no-diagnostics
+#ident "$Header:$"
+export module M;
+
+//--- pragma_comment.cpp
+// expected-no-diagnostics
+#pragma comment(lib, "msvcrt.lib")
+export module M;
+
+//--- pragma_mark.cpp
+// expected-no-diagnostics
+#pragma mark LLVM's world
+export module M;
+
+//--- pragma_detect_mismatch.cpp
+// expected-no-diagnostics
+#pragma detect_mismatch("test", "1")
+export module M;
+
+//--- pragma_clang_debug.cpp
+// expected-no-diagnostics
+#pragma clang __debug dump Test
+export module M;
+
+//--- pragma_message.cpp
+#pragma message "test" // expected-warning {{test}}
+export module M;
+
+//--- pragma_gcc_warn.cpp
+#pragma GCC warning "Foo" // expected-warning {{Foo}}
+export module M;
+
+//--- pragma_gcc_error.cpp
+#pragma GCC error "Foo" // expected-error {{Foo}}
+export module M;
+
+//--- pragma_diag_push_pop.cpp
+// expected-no-diagnostics
+#pragma gcc diagnostic push
+#pragma gcc diagnostic pop
+export module M;
+
+//--- pragma_diag_ignore.cpp
+// expected-no-diagnostics
+#pragma GCC diagnostic ignored "-Wframe-larger-than"
+export module M;
+
+//--- pragma_opencl_ext.cpp
+// expected-no-diagnostics
+#pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable
+export module M;
+
+//--- pragma_push_pop.cpp
+// expected-no-diagnostics
+#pragma warning(push)
+#pragma warning(pop)
+export module M;
+
+//--- pragma_exec_charset.cpp
+// expected-no-diagnostics
+#pragma execution_character_set(push, "UTF-8")
+#pragma execution_character_set(pop)
+export module M;
+
+//--- pragma_clang_assume_nonnull.cpp
+// expected-no-diagnostics
+#pragma clang assume_nonnull begin
+#pragma clang assume_nonnull end
+export module M;
+
+//--- marco_expand.cpp
+MACRO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- define.cpp
 // This is a comment
 #define I32 int // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
 export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
 export I32 i32;
+
+//--- undef.cpp
+#undef FOO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- defined.cpp
+#if defined(FOO) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- has_embed.cpp
+#if __has_embed(__FILE__ ext::token(0xB055)) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
+
+//--- has_include.cpp
+#if __has_include(<stdio.h>) || __has_include_next(<stdlib.h>) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} \
+                                                               // expected-warning {{#include_next in primary source file; will search from start of include path}}
+#endif
+export module M; // expected-error {{module declaration must occur at the start of the translation unit}}
diff --git a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
index abb42447d3e0..05830de9891f 100644
--- a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
+++ b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp
@@ -239,5 +239,21 @@ void f2() {
 
 }
 
+namespace GH153884 {
+  bool f1() {
+    auto f = [](auto) { return true; };
+    if constexpr (0)
+      return f(1);
+    return false;
+  }
+  bool f2() {
+    auto f = [](auto x) { if (x) return 1.5; else return "wat"; };
+    // expected-error@-1 {{'auto' in return type deduced as 'const char *' here but deduced as 'double' in earlier return statement}}
+    if constexpr (0)
+      return f(1);
+    // expected-note@-1 {{in instantiation of function template specialization 'GH153884::f2()}}
+    return false;
+  }
+}
 
 #endif
diff --git a/clang/test/CodeGen/LoongArch/targetattr-lasx.c b/clang/test/CodeGen/LoongArch/targetattr-lasx.c
new file mode 100644
index 000000000000..56fd6573ed34
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/targetattr-lasx.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple loongarch64 -target-feature -lsx -emit-llvm %s -o - | FileCheck %s
+
+__attribute__((target("lasx")))
+// CHECK: #[[ATTR0:[0-9]+]] {
+void testlasx() {}
+
+// CHECK: attributes #[[ATTR0]] = { {{.*}}"target-features"="+64bit,+lasx,+lsx"{{.*}} }
diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
index 26e0d124c828..d143188ee0f3 100644
--- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c
@@ -187,12 +187,12 @@ __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -208,12 +208,12 @@ __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -229,12 +229,12 @@ __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -250,12 +250,12 @@ __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -271,12 +271,12 @@ __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __
   return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) {
@@ -292,10 +292,10 @@ __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, _
   return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m512i test_mm512_maskz_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) {
+__m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) {
 // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32(
 // CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
 // CHECK: zeroinitializer
 // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
-  return _mm512_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
index 936be27da61d..b4b12c953194 100644
--- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c
+++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c
@@ -264,11 +264,11 @@ __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -278,11 +278,11 @@ __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwsud_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -292,11 +292,11 @@ __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -306,11 +306,11 @@ __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwsuds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -320,11 +320,11 @@ __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -334,11 +334,11 @@ __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwusd_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -348,11 +348,11 @@ __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -362,11 +362,11 @@ __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwusds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -376,11 +376,11 @@ __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128
   return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -390,11 +390,11 @@ __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m
   return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwuud_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
@@ -404,11 +404,11 @@ __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m12
   return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m128i test_mm_maskz_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
+__m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
 // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
-  return _mm_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
@@ -418,9 +418,9 @@ __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __
   return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
-__m256i test_mm256_maskz_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
+__m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
 // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
-  return _mm256_maskz_dpwuuds_epi32(__A, __B, __C, __D);
+  return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/ptrauth-qualifier-blocks.c b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
index 62da59cf327f..f460da205cac 100644
--- a/clang/test/CodeGen/ptrauth-qualifier-blocks.c
+++ b/clang/test/CodeGen/ptrauth-qualifier-blocks.c
@@ -82,9 +82,15 @@ void test_block_address_byref_capture() {
   // CHECK: store i32 33554432,
   // CHECK: store i32 48,
   // CHECK: [[COPY_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 4
-  // CHECK: store ptr @__Block_byref_object_copy_, ptr [[COPY_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[COPY_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_copy_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[COPY_HELPER_FIELD]], align
   // CHECK: [[DISPOSE_HELPER_FIELD:%.*]] = getelementptr inbounds nuw [[BYREF_T]], ptr [[BYREF]], i32 0, i32 5
-  // CHECK: store ptr @__Block_byref_object_dispose_, ptr [[DISPOSE_HELPER_FIELD]], align
+  // CHECK: [[T0:%.*]] = ptrtoint ptr [[DISPOSE_HELPER_FIELD]] to i64
+  // CHECK: [[T1:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @__Block_byref_object_dispose_ to i64), i32 0, i64 [[T0]])
+  // CHECK: [[T2:%.*]] = inttoptr i64 [[T1]] to ptr
+  // CHECK: store ptr [[T2]], ptr [[DISPOSE_HELPER_FIELD]], align
   //   flags - copy/dispose required
   // CHECK: store i32 1107296256, ptr
   __block struct A * __ptrauth(1, 1, 60) ptr = createA();
diff --git a/clang/test/CodeGenCXX/debug-info-class.cpp b/clang/test/CodeGenCXX/debug-info-class.cpp
index 0bc4fdaa565c..aa24a63c58cb 100644
--- a/clang/test/CodeGenCXX/debug-info-class.cpp
+++ b/clang/test/CodeGenCXX/debug-info-class.cpp
@@ -99,12 +99,12 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
-// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK %s
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKCOFF %s
+// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++98 %s -o - | FileCheck -check-prefix=CHECK98 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple x86_64-unknown_unknown -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKELF %s
+// RUN: %clang_cc1 -triple i686-cygwin -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKCOFF %s
+// RUN: %clang_cc1 -triple armv7l-unknown-linux-gnueabihf -emit-llvm -debug-info-kind=limited -fexceptions -std=c++11 %s -o - | FileCheck -check-prefix=CHECK11 -check-prefix=CHECK -check-prefix=CHECKELF %s
 
 // CHECK98: invoke {{.+}} @_ZN1BD1Ev(ptr {{[^,]*}} %b)
 // CHECK98-NEXT: unwind label %{{.+}}, !dbg ![[EXCEPTLOC:.*]]
@@ -122,6 +122,14 @@ int main(int argc, char **argv) {
 // CHECK-SAME:             ){{$}}
 
 // CHECK:      ![[INT:[0-9]+]] = !DIBasicType(name: "int"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_class_type, name: "B"
+// CHECKCOFF-NOT:              DIFlagFwdDecl
+// CHECKCOFF-SAME:             ){{$}}
+// CHECKCOFF: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
+// CHECKCOFF-SAME:           DIFlagArtificial
 
 // CHECK: [[C:![0-9]*]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "C",
 // CHECK-NOT:                              DIFlagFwdDecl
@@ -137,19 +145,19 @@ int main(int argc, char **argv) {
 // CHECK-SAME:                     DIFlagStaticMember
 // CHECK: [[C_DTOR]] = !DISubprogram(name: "~C"
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
-// CHECK-SAME:             identifier: "_ZTS1K"
-// CHECK-SAME:             ){{$}}
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
+// CHECKELF-SAME:             identifier: "_ZTS1K"
+// CHECKELF-SAME:             ){{$}}
 
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "B"
-// CHECK-NOT:              DIFlagFwdDecl
-// CHECK-SAME:             ){{$}}
-// CHECK: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
-// CHECK-SAME:           DIFlagArtificial
+// CHECKELF: !DICompositeType(tag: DW_TAG_class_type, name: "B"
+// CHECKELF-NOT:              DIFlagFwdDecl
+// CHECKELF-SAME:             ){{$}}
+// CHECKELF: !DIDerivedType(tag: DW_TAG_member, name: "_vptr$B",
+// CHECKELF-SAME:           DIFlagArtificial
 
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
-// CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
-// CHECK: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "foo"
+// CHECKELF: !DICompositeType(tag: DW_TAG_class_type, name: "bar"
+// CHECKELF: !DICompositeType(tag: DW_TAG_union_type, name: "baz"
 
 // CHECK: [[D:![0-9]+]] = !DICompositeType(tag: DW_TAG_structure_type, name: "D"
 // CHECK-SAME:             size:
@@ -162,6 +170,10 @@ int main(int argc, char **argv) {
 // CHECK-NOT:              identifier:
 // CHECK-SAME:             ){{$}}
 
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "K"
+// CHECKCOFF-SAME:             identifier: "_ZTS1K"
+// CHECKCOFF-SAME:             ){{$}}
+
 // CHECK: [[L:![0-9]+]] = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "L"
 // CHECK-SAME:             ){{$}}
 // CHECK: [[L_FUNC_DECL:![0-9]*]] = !DISubprogram(name: "func",{{.*}} scope: [[L]]
diff --git a/clang/test/CodeGenCXX/debug-info-structured-binding.cpp b/clang/test/CodeGenCXX/debug-info-structured-binding.cpp
index 5fbd54c16382..4a4a4d8bdfaa 100644
--- a/clang/test/CodeGenCXX/debug-info-structured-binding.cpp
+++ b/clang/test/CodeGenCXX/debug-info-structured-binding.cpp
@@ -7,6 +7,12 @@
 // CHECK: #dbg_declare(ptr %{{[0-9]+}}, ![[VAR_4:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 4),
 // CHECK: #dbg_declare(ptr %z1, ![[VAR_5:[0-9]+]], !DIExpression()
 // CHECK: #dbg_declare(ptr %z2, ![[VAR_6:[0-9]+]], !DIExpression()
+// CHECK: getelementptr inbounds nuw %struct.A, ptr {{.*}}, i32 0, i32 1, !dbg ![[Y1_DEBUG_LOC:[0-9]+]]
+// CHECK: getelementptr inbounds nuw %struct.A, ptr {{.*}}, i32 0, i32 1, !dbg ![[Y2_DEBUG_LOC:[0-9]+]]
+// CHECK: load ptr, ptr %z2, {{.*}}!dbg ![[Z2_DEBUG_LOC:[0-9]+]]
+// CHECK: getelementptr inbounds [2 x i32], ptr {{.*}}, i64 0, i64 1, !dbg ![[A2_DEBUG_LOC:[0-9]+]]
+// CHECK: getelementptr inbounds nuw { i32, i32 }, ptr {{.*}}, i32 0, i32 1, !dbg ![[C2_DEBUG_LOC:[0-9]+]]
+// CHECK: extractelement <2 x i32> {{.*}}, i32 1, !dbg ![[V2_DEBUG_LOC:[0-9]+]]
 // CHECK: ![[VAR_0]] = !DILocalVariable(name: "a"
 // CHECK: ![[VAR_1]] = !DILocalVariable(name: "x1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}})
 // CHECK: ![[VAR_2]] = !DILocalVariable(name: "y1", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: {{[0-9]+}}, type: !{{[0-9]+}})
@@ -46,5 +52,41 @@ int f() {
   auto [x1, y1] = a;
   auto &[x2, y2] = a;
   auto [z1, z2] = B{1, 2};
-  return x1 + y1 + x2 + y2 + z1 + z2;
+  int array[2] = {3, 4};
+  auto &[a1, a2] = array;
+  _Complex int cmplx = {1, 2};
+  auto &[c1, c2] = cmplx;
+  int vctr __attribute__ ((vector_size (sizeof(int)*2)))= {1, 2};
+  auto &[v1, v2] = vctr;
+  return //
+     x1 //
+     +  //
+// CHECK: ![[Y1_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     y1 //
+     +  //
+     x2 //
+     +  //
+// CHECK: ![[Y2_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     y2 //
+     +  //
+     z1 //
+     +  //
+// CHECK: ![[Z2_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     z2 //
+     +  //
+     a1 //
+     +  //
+// CHECK: ![[A2_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     a2 //
+     +  //
+     c1 //
+     +  //
+// CHECK: ![[C2_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     c2 //
+     +  //
+     v1 //
+     +  //
+// CHECK: ![[V2_DEBUG_LOC]] = !DILocation(line: [[@LINE+1]]
+     v2 //
+     ;
 }
diff --git a/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp b/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
index 9a8ce1997a7f..19c2a9bd0497 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-exact-disabled.cpp
@@ -3,6 +3,7 @@
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fvisibility=hidden -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fapple-kext -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 // RUN: %clang_cc1 -I%S %s -triple x86_64-apple-darwin10 -O1 -fno-assume-unique-vtables -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
+// RUN: %clang_cc1 -I%S %s -triple arm64e-apple-darwin10 -O1 -fptrauth-calls -emit-llvm -std=c++11 -o - | FileCheck %s --check-prefixes=CHECK,INEXACT
 
 struct A { virtual ~A(); };
 struct B final : A { };
diff --git a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
index 1b103719fbe4..e33525c1ec0f 100644
--- a/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
+++ b/clang/test/CodeGenCXX/ptrauth-explicit-vtable-pointer-control.cpp
@@ -1,31 +1,31 @@
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple arm64-apple-ios   -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,NODISC %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,TYPE %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,ADDR %s
 
-// RUN: %clang_cc1 %s -x c++ -std=c++11 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
+// RUN: %clang_cc1 %s -x c++ -std=c++20 -triple aarch64-linux-gnu -fptrauth-calls -fptrauth-intrinsics \
 // RUN:   -fptrauth-vtable-pointer-type-discrimination \
 // RUN:   -fptrauth-vtable-pointer-address-discrimination \
 // RUN:   -emit-llvm -o - | FileCheck --check-prefixes=CHECK,BOTH %s
@@ -78,6 +78,27 @@ struct authenticated(default_key, default_address_discrimination, custom_discrim
   virtual void g();
 };
 
+// CHECK: @_ZTVN5test19ConstEvalE = external unnamed_addr constant { [3 x ptr] }, align 8
+// CHECK: @_ZN5test12ceE = global %{{.*}} { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test19ConstEvalE, i32 0, i32 0, i32 2), i32 2, i64 0, ptr @_ZN5test12ceE) }, align 8
+// CHECK: @_ZTVN5test116ConstEvalDerivedE = linkonce_odr unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTIN5test116ConstEvalDerivedE, ptr ptrauth (ptr @_ZN5test19ConstEval1fEv, i32 0, i64 26259, ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN5test116ConstEvalDerivedE, i32 0, i32 0, i32 2))] },{{.*}}align 8
+// CHECK: @_ZN5test13cedE = global { ptr } { ptr ptrauth (ptr getelementptr inbounds inrange(-16, 8) ({ [3 x ptr] }, ptr @_ZTVN5test116ConstEvalDerivedE, i32 0, i32 0, i32 2), i32 2, i64 0, ptr @_ZN5test13cedE) }, align 8
+
+struct authenticated(default_key, address_discrimination, no_extra_discrimination) ConstEval {
+  consteval ConstEval() {}
+  virtual void f();
+};
+
+// clang used to bail out with error message "could not emit constant value abstractly".
+ConstEval ce;
+
+struct ConstEvalDerived : public ConstEval {
+public:
+  consteval ConstEvalDerived() {}
+};
+
+// clang used to emit an undef initializer.
+ConstEvalDerived ced;
+
 template <typename T>
 struct SubClass : T {
   virtual void g();
diff --git a/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp b/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
index 249586f5991f..b24ece159832 100644
--- a/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
+++ b/clang/test/CodeGenCXX/vtable-debug-info-inheritance-simple.cpp
@@ -1,5 +1,3 @@
-// REQUIRES: target={{x86_64.*-linux.*}}
-
 // Simple inheritance case:
 // For CBase and CDerived we check:
 // - Generation of their vtables (including attributes).
@@ -30,13 +28,20 @@ int main() {
   return 0;
 }
 
-// RUN: %clang --target=x86_64-linux -Xclang -disable-O0-optnone -Xclang -disable-llvm-passes -emit-llvm -S -g %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm -mrelocation-model pic -pic-is-pie -debug-info-kind=limited -dwarf-version=5 -disable-O0-optnone -disable-llvm-passes %s -o - | FileCheck %s --check-prefix=COFF
 
 // CHECK: $_ZTVN3NSP5CBaseE = comdat any
 // CHECK: $_ZTV8CDerived = comdat any
 
 // CHECK: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[BASE_VTABLE_VAR:![0-9]*]]
 // CHECK: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8, !dbg [[DERIVED_VTABLE_VAR:![0-9]*]]
+// COFF: @_ZTVN3NSP5CBaseE = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8
+// COFF-NOT: !dbg
+// COFF-SAME: {{$}}
+// COFF: @_ZTV8CDerived = linkonce_odr {{dso_local|hidden}} unnamed_addr constant {{.*}}, comdat, align 8
+// COFF-NOT: !dbg
+// COFF-SAME: {{$}}
 
 // CHECK: [[BASE_VTABLE_VAR]] = !DIGlobalVariableExpression(var: [[BASE_VTABLE:![0-9]*]], expr: !DIExpression())
 // CHECK-NEXT: [[BASE_VTABLE]] = distinct !DIGlobalVariable(name: "_vtable$", linkageName: "_ZTVN3NSP5CBaseE"
diff --git a/clang/test/CodeGenCoroutines/coro-gro.cpp b/clang/test/CodeGenCoroutines/coro-gro.cpp
index b62134317cef..037fd03349e7 100644
--- a/clang/test/CodeGenCoroutines/coro-gro.cpp
+++ b/clang/test/CodeGenCoroutines/coro-gro.cpp
@@ -106,4 +106,31 @@ invoker g() {
   // CHECK: call void @_ZN7invoker15invoker_promise17get_return_objectEv({{.*}} %[[AggRes]]
   co_return;
 }
-// CHECK: ![[OutFrameMetadata]] = !{}
\ No newline at end of file
+
+namespace gh148953 {
+
+struct Task {
+  struct promise_type {
+    Task get_return_object();
+    std::suspend_always initial_suspend() { return {}; }
+    std::suspend_always final_suspend() noexcept { return {}; }
+    void return_void() {}
+    void unhandled_exception() {}
+  };
+  Task() {}
+  // Different from `invoker`, this Task is copy constructible.
+  Task(const Task&) {};
+};
+
+// NRVO on const qualified return type should work.
+// CHECK: define{{.*}} void @_ZN8gh1489537exampleEv({{.*}} sret(%"struct.gh148953::Task") align 1 %[[NrvoRes:.+]])
+const Task example() {
+  // CHECK: %[[ResultPtr:.+]] = alloca ptr
+  // CHECK: store ptr %[[NrvoRes]], ptr %[[ResultPtr]]
+  // CHECK: coro.init:
+  // CHECK: call void @_ZN8gh1489534Task12promise_type17get_return_objectEv({{.*}} %[[NrvoRes:.+]], {{.*}})
+  co_return;
+}
+
+} // namespace gh148953
+// CHECK: ![[OutFrameMetadata]] = !{}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
new file mode 100644
index 000000000000..b51670fd6459
--- /dev/null
+++ b/clang/test/CodeGenObjC/ptrauth-block-descriptor-pointer.m
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -fptrauth-block-descriptor-pointers -triple arm64e-apple-ios  -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -fobjc-arc -fblocks -fptrauth-calls -triple arm64e-apple-ios -DNO_BLOCK_DESC_AUTH -emit-llvm -o - %s | FileCheck %s --check-prefix=NODESCRIPTORAUTH
+
+#ifndef NO_BLOCK_DESC_AUTH
+_Static_assert(__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should set ptrauth_signed_block_descriptors");
+#else
+_Static_assert(!__has_feature(ptrauth_signed_block_descriptors), "-fptrauth-block-descriptor-pointers should not be enabled by default");
+#endif
+
+void a() {
+  // Test out a global block.
+  void (^blk)(void) = ^{};
+}
+
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr ptrauth (ptr [[BLOCK_DESCRIPTOR_NAME]], i32 2, i64 49339, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 4)) }
+
+// NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }
+// NODESCRIPTORAUTH: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr @_NSConcreteGlobalBlock, i32 1342177280, i32 0, ptr ptrauth (ptr @__a_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
+
+
+void b(int p) {
+  // CHECK-LABEL: define void @b
+
+  // Test out a stack block.
+  void (^blk)(void) = ^{(void)p;};
+
+  // CHECK: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+  // CHECK: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+  // CHECK: [[BLOCK_DESCRIPTOR_REF_INT:%.*]] = ptrtoint ptr [[BLOCK_DESCRIPTOR_REF]] to i64
+  // CHECK: [[BLENDED:%.*]] = call i64 @llvm.ptrauth.blend(i64 [[BLOCK_DESCRIPTOR_REF_INT]], i64 49339)
+  // CHECK: [[SIGNED_REF:%.*]] = call i64 @llvm.ptrauth.sign(i64 ptrtoint (ptr @"__block_descriptor_36_e5_v8\01?0l" to i64), i32 2, i64 [[BLENDED]])
+  // CHECK: [[SIGNED_REF_PTR:%.*]] = inttoptr i64 [[SIGNED_REF]] to ptr
+  // CHECK: store ptr [[SIGNED_REF_PTR]], ptr [[BLOCK_DESCRIPTOR_REF]]
+
+  // NODESCRIPTORAUTH: [[BLOCK:%.*]] = alloca <{ ptr, i32, i32, ptr, ptr, i32 }>
+  // NODESCRIPTORAUTH: [[BLOCK_DESCRIPTOR_REF:%.*]] = getelementptr inbounds nuw <{ {{.*}} }>, ptr [[BLOCK]], i32 0, i32 4
+  // NODESCRIPTORAUTH: store ptr @"__block_descriptor_36_e5_v8\01?0l", ptr [[BLOCK_DESCRIPTOR_REF]]
+}
diff --git a/clang/test/CodeGenObjC/ptrauth-block-isa.m b/clang/test/CodeGenObjC/ptrauth-block-isa.m
index c1e98c6fd9d3..248e57769ba1 100644
--- a/clang/test/CodeGenObjC/ptrauth-block-isa.m
+++ b/clang/test/CodeGenObjC/ptrauth-block-isa.m
@@ -1,7 +1,8 @@
-// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s  -o - | FileCheck %s
+// RUN: %clang_cc1 -fptrauth-calls -fptrauth-objc-isa -fobjc-arc -fblocks -triple arm64e -emit-llvm %s -o - | FileCheck %s
 
 void (^globalblock)(void) = ^{};
-// CHECK: [[GLOBAL_BLOCK:@.*]] = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr [[GLOBAL_BLOCK]]), i32 1342177280, i32 0, ptr @globalblock_block_invoke, ptr @"__block_descriptor_32_e5_v8\01?0l" }, align 8 #0
+// CHECK: [[BLOCK_DESCRIPTOR_NAME:@"__block_descriptor_.*"]] = linkonce_odr hidden unnamed_addr constant { i64, i64, ptr, ptr } { i64 0, i64 32, ptr @.str, ptr null }, comdat, align 8
+// CHECK: @__block_literal_global = internal constant { ptr, i32, i32, ptr, ptr } { ptr ptrauth (ptr @_NSConcreteGlobalBlock, i32 2, i64 27361, ptr @__block_literal_global), i32 1342177280, i32 0, ptr ptrauth (ptr @globalblock_block_invoke, i32 0, i64 0, ptr getelementptr inbounds ({ ptr, i32, i32, ptr, ptr }, ptr @__block_literal_global, i32 0, i32 3)), ptr [[BLOCK_DESCRIPTOR_NAME]] }
 
 @interface A
 - (int) count;
diff --git a/clang/test/DebugInfo/KeyInstructions/asm.c b/clang/test/DebugInfo/KeyInstructions/asm.c
new file mode 100644
index 000000000000..2b3301660f7b
--- /dev/null
+++ b/clang/test/DebugInfo/KeyInstructions/asm.c
@@ -0,0 +1,59 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64 -target-feature +ls64 -O0 -emit-llvm -x c %s -o - -gkey-instructions -debug-info-kind=line-tables-only -gno-column-info | FileCheck %s
+// Partially copied from clang/test/CodeGen/AArch64/ls64-inline-asm.c
+
+// Check the inline asm call and result store are Key and distinct atoms.
+
+struct foo { unsigned long long x[8]; };
+// CHECK-LABEL: define dso_local void @load(
+// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0:[0-9]+]] !dbg [[DBG5:![0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG9:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG9]]
+// CHECK-NEXT:    [[TMP2:%.*]] = call i512 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1:[0-9]+]], !dbg [[DBG10:![0-9]+]], !srcloc [[META11:![0-9]+]]
+// CHECK-NEXT:    store i512 [[TMP2]], ptr [[TMP0]], align 8, !dbg [[DBG12:![0-9]+]]
+// CHECK-NEXT:    ret void, !dbg [[DBG13:![0-9]+]]
+//
+void load(struct foo *output, void *addr) {
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+
+// CHECK-LABEL: define dso_local void @load2(
+// CHECK-SAME: ptr noundef [[OUTPUT:%.*]], ptr noundef [[ADDR:%.*]]) #[[ATTR0]] !dbg [[DBG14:![0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[OUTPUT_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ADDR_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[OUTPUT]], ptr [[OUTPUT_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ADDR]], ptr [[ADDR_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[OUTPUT_ADDR]], align 8, !dbg [[DBG15:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[ADDR_ADDR]], align 8, !dbg [[DBG15]]
+// CHECK-NEXT:    [[TMP2:%.*]] = call i32 asm sideeffect "ld64b $0,[$1]", "=r,r,~{memory}"(ptr [[TMP1]]) #[[ATTR1]], !dbg [[DBG16:![0-9]+]], !srcloc [[META17:![0-9]+]]
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP0]], align 4, !dbg [[DBG18:![0-9]+]]
+// CHECK-NEXT:    ret void, !dbg [[DBG19:![0-9]+]]
+//
+void load2(int *output, void *addr) {
+    __asm__ volatile ("ld64b %0,[%1]" : "=r" (*output) : "r" (addr) : "memory");
+}
+//.
+// CHECK: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C11, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+// CHECK: [[META1]] = !DIFile(filename: "{{.*}}<stdin>", directory: {{.*}})
+// CHECK: [[DBG5]] = distinct !DISubprogram(name: "load", scope: [[META6:![0-9]+]], file: [[META6]], line: 21, type: [[META7:![0-9]+]], scopeLine: 21, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true)
+// CHECK: [[META6]] = !DIFile(filename: "{{.*}}asm.c", directory: {{.*}})
+// CHECK: [[META7]] = !DISubroutineType(types: [[META8:![0-9]+]])
+// CHECK: [[META8]] = !{}
+// CHECK: [[DBG9]] = !DILocation(line: 22, scope: [[DBG5]])
+// CHECK: [[DBG10]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 1, atomRank: 1)
+// CHECK: [[META11]] = !{i64 1458}
+// CHECK: [[DBG12]] = !DILocation(line: 22, scope: [[DBG5]], atomGroup: 2, atomRank: 1)
+// CHECK: [[DBG13]] = !DILocation(line: 23, scope: [[DBG5]], atomGroup: 3, atomRank: 1)
+// CHECK: [[DBG14]] = distinct !DISubprogram(name: "load2", scope: [[META6]], file: [[META6]], line: 38, type: [[META7]], scopeLine: 38, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: [[META0]], keyInstructions: true)
+// CHECK: [[DBG15]] = !DILocation(line: 39, scope: [[DBG14]])
+// CHECK: [[DBG16]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 1, atomRank: 1)
+// CHECK: [[META17]] = !{i64 2501}
+// CHECK: [[DBG18]] = !DILocation(line: 39, scope: [[DBG14]], atomGroup: 2, atomRank: 1)
+// CHECK: [[DBG19]] = !DILocation(line: 40, scope: [[DBG14]], atomGroup: 3, atomRank: 1)
+//.
diff --git a/clang/test/DebugInfo/KeyInstructions/goto.c b/clang/test/DebugInfo/KeyInstructions/goto.c
new file mode 100644
index 000000000000..ead92e600ca5
--- /dev/null
+++ b/clang/test/DebugInfo/KeyInstructions/goto.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c++ -std=c++17 %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \
+// RUN: | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -gkey-instructions -x c %s -debug-info-kind=line-tables-only -emit-llvm -o - -gno-column-info \
+// RUN: | FileCheck %s
+
+// Check the goto branches get Key Instructions metadata.
+void ext();
+void test_goto(void) {
+// CHECK: br label %dst1, !dbg [[G1R1:!.*]]
+  goto dst1;
+dst1:
+  ext();
+
+  void *ptr = &&dst2;
+// CHECK: br label %indirectgoto, !dbg [[G3R1:!.*]]
+  goto *ptr;
+dst2:
+  ext();
+
+// CHECK: br label %dst3, !dbg [[G4R1:!.*]]
+  goto *&&dst3;
+dst3:
+  ext();
+
+  return;
+}
+
+// CHECK: [[G1R1]] = !DILocation(line: 10, scope: ![[#]], atomGroup: 1, atomRank: 1)
+// CHECK: [[G3R1]] = !DILocation(line: 16, scope: ![[#]], atomGroup: 3, atomRank: 1)
+// CHECK: [[G4R1]] = !DILocation(line: 21, scope: ![[#]], atomGroup: 4, atomRank: 1)
diff --git a/clang/test/Driver/DTLTO/dtlto.c b/clang/test/Driver/DTLTO/dtlto.c
new file mode 100644
index 000000000000..96795d9a4e6a
--- /dev/null
+++ b/clang/test/Driver/DTLTO/dtlto.c
@@ -0,0 +1,48 @@
+// REQUIRES: lld
+
+/// Check DTLTO options are forwarded to the linker.
+
+/// Check that options are forwarded as expected with --thinlto-distributor=.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \
+// RUN:   -fthinlto-distributor=d.exe -Werror 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=FORWARD
+
+// FORWARD: ld.lld
+// FORWARD-SAME: "--thinlto-distributor=d.exe"
+// FORWARD-SAME: "--thinlto-remote-compiler={{[^"]+}}"
+// FORWARD-SAME: "--thinlto-distributor-arg=a1"
+// FORWARD-SAME: "--thinlto-distributor-arg=a2"
+// FORWARD-SAME: "--thinlto-distributor-arg=a3"
+
+/// Check that options are not added without --thinlto-distributor= and
+/// that a warning is issued for unused -Xthinlto-distributor options.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=NODIST --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a1'
+// NODIST: warning: argument unused during compilation: '-Xthinlto-distributor=a2,a3'
+// NODIST: ld.lld
+
+/// Check the expected arguments are forwarded by default with only
+/// --thinlto-distributor=.
+// RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -fthinlto-distributor=d.exe -Werror 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=DEFAULT --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// DEFAULT: ld.lld
+// DEFAULT-SAME: "--thinlto-distributor=d.exe"
+// DEFAULT-SAME: "--thinlto-remote-compiler={{.*}}clang{{[^\"]*}}"
+
+/// Check that nothing is forwarded when the compiler is not in LTO mode, and that
+/// appropriate unused option warnings are issued.
+// RUN: %clang %s -### -fuse-ld=lld --target=x86_64-linux-gnu \
+// RUN:   -fthinlto-distributor=d.exe 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=NOFLTO --implicit-check-not=distributor \
+// RUN:     --implicit-check-not=remote-compiler
+
+// NOFLTO: warning: argument unused during compilation: '-fthinlto-distributor=d.exe'
+// NOFLTO: ld.lld
diff --git a/clang/test/Driver/aarch64-toolchain.c b/clang/test/Driver/aarch64-toolchain.c
index cfad4b8eb682..512b5a883cd5 100644
--- a/clang/test/Driver/aarch64-toolchain.c
+++ b/clang/test/Driver/aarch64-toolchain.c
@@ -11,7 +11,7 @@
 // LLD-AARCH64-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
 // LLD-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// LLD-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // LLD-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // LLD-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -30,7 +30,7 @@
 // C-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // C-AARCH64-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// C-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// C-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -47,7 +47,7 @@
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "aarch64-unknown-none-elf"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64linux" "-EL"
+// C-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64elf" "-EL"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // C-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -67,7 +67,7 @@
 // CXX-AARCH64-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // CXX-AARCH64-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -86,7 +86,7 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/8.2.1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -105,7 +105,7 @@
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "--sysroot={{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf"
-// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-LIBCXX: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
@@ -122,7 +122,7 @@
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include/c++/v1"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/include"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../bin/aarch64-none-elf-ld"
-// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/../../../../aarch64-none-elf/lib/crt0.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1/crtbegin.o"
 // CXX-AARCH64-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_aarch64_gcc_tree/lib/gcc/aarch64-none-elf/8.2.1"
diff --git a/clang/test/Driver/arch-arm64e.c b/clang/test/Driver/arch-arm64e.c
index 0fb12d4dcc5e..39006d203989 100644
--- a/clang/test/Driver/arch-arm64e.c
+++ b/clang/test/Driver/arch-arm64e.c
@@ -2,11 +2,20 @@
 
 // RUN: %clang -target arm64-apple-darwin -c %s -### 2>&1 | FileCheck %s --check-prefix NONE
 // NONE: "-cc1"
-// NONE-NOT: "-fptrauth-intrinsics"
+
 // NONE-NOT: "-fptrauth-calls"
 // NONE-NOT: "-fptrauth-returns"
+// NONE-NOT: "-fptrauth-intrinsics"
 // NONE-NOT: "-fptrauth-indirect-gotos"
 // NONE-NOT: "-fptrauth-auth-traps"
+// NONE-NOT: "-fptrauth-vtable-pointer-address-discrimination"
+// NONE-NOT: "-fptrauth-vtable-pointer-type-discrimination"
+// NONE-NOT: "-fptrauth-objc-isa"
+// NONE-NOT: "-fptrauth-objc-class-ro"
+// NONE-NOT: "-fptrauth-objc-interface-sel"
+
+// Final catch all if any new flags are added
+// NONE-NOT: "-fptrauth"
 
 // RUN: %clang -target arm64-apple-darwin -fptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix CALL
 // CALL: "-cc1"{{.*}} {{.*}} "-fptrauth-calls"
@@ -23,39 +32,39 @@
 // RUN: %clang -target arm64-apple-darwin -fptrauth-auth-traps -c %s -### 2>&1 | FileCheck %s --check-prefix TRAPS
 // TRAPS: "-cc1"{{.*}} {{.*}} "-fptrauth-auth-traps"
 
-
 // Check the arm64e defaults.
 
 // RUN: %clang -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
+// RUN: %clang -target arm64e-apple-macos -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULTMAC
+// RUN: %if system-darwin && target={{.*}}-{{darwin|macos}}{{.*}} %{ %clang -target arm64e-apple-macos -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULTARCH %}
 // RUN: %clang -mkernel -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
 // RUN: %clang -fapple-kext -target arm64e-apple-ios -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT
-// DEFAULT: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULT: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULTMAC: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-m1"{{.*}}
+// DEFAULTARCH: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel"
 
 // RUN: %clang -target arm64e-apple-none-macho -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-MACHO
-// DEFAULT-MACHO: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// DEFAULT-MACHO: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"{{.*}}
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
-// RUN: %clang -mkernel -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
-// RUN: %clang -fapple-kext -target arm64e-apple-ios -fno-ptrauth-calls -c %s -### 2>&1 | FileCheck %s --check-prefix DEFAULT-NOCALL
 // DEFAULT-NOCALL-NOT: "-fptrauth-calls"
-// DEFAULT-NOCALL: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"
+// DEFAULT-NOCALL: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-returns -c %s -### 2>&1 | FileCheck %s --check-prefix NORET
 
 // NORET-NOT: "-fptrauth-returns"
-// NORET: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"
+// NORET: "-fptrauth-calls" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-intrinsics -c %s -### 2>&1 | FileCheck %s --check-prefix NOINTRIN
 
-// NOINTRIN: "-fptrauth-returns"
 // NOINTRIN-NOT: "-fptrauth-intrinsics"
-// NOINTRIN: "-fptrauth-calls" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" {{.*}}"-target-cpu" "apple-a12"{{.*}}
+// NOINTRIN: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-indirect-gotos" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // RUN: %clang -target arm64e-apple-ios -fno-ptrauth-auth-traps -c %s -### 2>&1 | FileCheck %s --check-prefix NOTRAP
-// NOTRAP: "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-indirect-gotos" {{.*}}"-target-cpu" "apple-a12"
+// NOTRAP: "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-intrinsics" "-fptrauth-indirect-gotos" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-objc-isa" "-fptrauth-objc-class-ro" "-fptrauth-objc-interface-sel" {{.*}}"-target-cpu" "apple-a12"
 
 
 // Check the CPU defaults and overrides.
diff --git a/clang/test/Driver/arm-toolchain.c b/clang/test/Driver/arm-toolchain.c
index c367594b0a75..9005992f2b75 100644
--- a/clang/test/Driver/arm-toolchain.c
+++ b/clang/test/Driver/arm-toolchain.c
@@ -10,7 +10,7 @@
 // LLD-ARM-BAREMETAL: "-isysroot" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
 // LLD-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/lld/ld.lld"
-// LLD-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// LLD-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // LLD-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // LLD-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -29,7 +29,7 @@
 // C-ARM-BAREMETAL: "-internal-isystem" "{{.*}}Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // C-ARM-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// C-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// C-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -46,7 +46,7 @@
 // C-ARM-BAREMETAL-NOSYSROOT: "-cc1" "-triple" "thumbv6m-unknown-none-eabi"
 // C-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// C-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf" "-EL"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // C-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -67,7 +67,7 @@
 // CXX-ARM-BAREMETAL: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // CXX-ARM-BAREMETAL: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -87,7 +87,7 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/8.2.1"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -106,7 +106,7 @@
 // CXX-ARM-BAREMETAL-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
 // CXX-ARM-BAREMETAL-LIBCXX: "--sysroot={{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi"
-// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-LIBCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
@@ -123,7 +123,7 @@
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include/c++/v1"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-internal-isystem" "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/include"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../bin/armv6m-none-eabi-ld"
-// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/../../../../armv6m-none-eabi/lib/crt0.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1/crtbegin.o"
 // CXX-ARM-BAREMETAL-NOSYSROOT-LIBCXX: "-L{{.*}}/Inputs/basic_arm_gcc_tree/lib/gcc/armv6m-none-eabi/8.2.1"
diff --git a/clang/test/Driver/baremetal-multilib-includedirs-error.yaml b/clang/test/Driver/baremetal-multilib-includedirs-error.yaml
new file mode 100644
index 000000000000..781f5f907f8f
--- /dev/null
+++ b/clang/test/Driver/baremetal-multilib-includedirs-error.yaml
@@ -0,0 +1,30 @@
+# Downstream issue: #446 (Extend the Multilib system to support an IncludeDirs field)
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
+# This test demonstrates the new "IncludeDirs" field.
+# This field allows specifying include directories through
+# multilib.yaml configuration using relative paths.
+# Absolute paths should be rejected by the driver
+# with an appropriate diagnostic message.
+#
+# This test verifies that passing an absolute path
+# (/include) to IncludeDirs triggers the expected
+# error.
+
+# RUN: not %clang --target=aarch64-none-elf --multi-lib-config=%s %s -o /dev/null 2>&1 \
+# RUN: | FileCheck %s --check-prefix=CHECK-ERROR
+# CHECK-ERROR: error: paths must be relative but "/include" starts with "/"
+
+---
+MultilibVersion: 1.0
+Variants:
+- Dir: aarch64-none-elf/aarch64a_exn_rtti_unaligned
+  Flags:
+  - --target=aarch64-unknown-none-elf
+  - -munaligned-access
+  IncludeDirs:
+  - /include
+  - aarch64-none-elf/include
+
+...
diff --git a/clang/test/Driver/baremetal-multilib-includedirs.yaml b/clang/test/Driver/baremetal-multilib-includedirs.yaml
new file mode 100644
index 000000000000..dc3f8aca5deb
--- /dev/null
+++ b/clang/test/Driver/baremetal-multilib-includedirs.yaml
@@ -0,0 +1,33 @@
+# Downstream issue: #446 (Extend the Multilib system to support an IncludeDirs field)
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
+# This test demonstrates the new "IncludeDirs" field.
+# This field allows specifying include directories through
+# multilib.yaml configuration. When this field is present
+# it is appended to the sysroot during header path
+# resolution ignoring the default bare-metal assumptions.
+
+# RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -x c++ %s -### -o %t.out 2>&1 \
+# RUN:     --target=thumbv7m-none-eabi -mfloat-abi=soft --sysroot= \
+# RUN:   | FileCheck %s
+# CHECK:      "-cc1" "-triple" "thumbv7m-unknown-none-eabi"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT:[^"]*]]/bin/../lib/clang-runtimes/include/c++/v1"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/include-libunwind/c++/v1"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/soft/include/c++/v1"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/include"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/include-libunwind"
+# CHECK-SAME: "-internal-isystem" "[[SYSROOT]]/bin/../lib/clang-runtimes/soft/include"
+
+---
+MultilibVersion: 1.0
+Variants:
+- Dir: soft
+  Flags:
+    - -mfloat-abi=soft
+  IncludeDirs:
+    - include
+    - include-libunwind
+    - soft/include
+
+...
diff --git a/clang/test/Driver/baremetal.cpp b/clang/test/Driver/baremetal.cpp
index 4dc320191317..8b5ab4355087 100644
--- a/clang/test/Driver/baremetal.cpp
+++ b/clang/test/Driver/baremetal.cpp
@@ -17,7 +17,7 @@
 // CHECK-V6M-C-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-V6M-C-NEXT: ld{{(.exe)?}}"
 // CHECK-V6M-C-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-C-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-C-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-C-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-C-SAME: "-T" "semihosted.lds" "-Lsome{{[/\\]+}}directory{{[/\\]+}}user{{[/\\]+}}asked{{[/\\]+}}for"
 // CHECK-V6M-C-SAME: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
@@ -43,7 +43,7 @@
 // CHECK-V6M-TREE-SAME: "-internal-isystem" "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}include{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-V6M-TREE-NEXT: ld{{(.exe)?}}"
-// CHECK-V6M-TREE-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-TREE-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-TREE-SAME: "[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi{{[/\\]+}}crt0.o"
 // CHECK-V6M-TREE-SAME: "-L[[INSTALLED_DIR]]{{[/\\]+}}..{{[/\\]+}}lib{{[/\\]+}}armv6m-unknown-none-eabi"
 // CHECK-V6M-TREE-SAME "{{.*}}.o"
@@ -60,7 +60,7 @@
 // CHECK-ARMV7M-PER-TARGET: "-x" "c++" "{{.*}}baremetal.cpp"
 // CHECK-ARMV7M-PER-TARGET: ld{{(.exe)?}}"
 // CHECK-ARMV7M-PER-TARGET: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7M-PER-TARGET: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-ARMV7M-PER-TARGET: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-ARMV7M-PER_TARGET: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-ARMV7M-PER-TARGET: "-L[[SYSROOT:[^"]+]]{{[/\\]+}}lib"
 // CHECK-ARMV7M-PER-TARGET: "-L[[RESOURCE_DIR:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}armv7m-vendor-none-eabi
@@ -73,7 +73,7 @@
 // CHECK-V6M-DEFAULTCXX: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-DEFAULTCXX: ld{{(.exe)?}}"
 // CHECK-V6M-DEFAULTCXX: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-DEFAULTCXX: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-DEFAULTCXX: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-DEFAULTCXX-SAME: "[[SYSROOT:[^"]+]]{{[/\\]+}}lib{{[/\\]+}}crt0.o"
 // CHECK-V6M-DEFAULTCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-DEFAULTCXX-SAME: "{{.*}}.o"
@@ -90,7 +90,7 @@
 // CHECK-V6M-LIBCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}v1"
 // CHECK-V6M-LIBCXX: ld{{(.exe)?}}"
 // CHECK-V6M-LIBCXX-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-LIBCXX-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-LIBCXX-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-LIBCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBCXX-SAME: "-lc++"
@@ -108,7 +108,7 @@
 // CHECK-V6M-LIBSTDCXX-SAME: "-internal-isystem" "{{[^"]+}}{{[/\\]+}}include{{[/\\]+}}c++{{[/\\]+}}6.0.0"
 // CHECK-V6M-LIBSTDCXX: ld{{(.exe)?}}"
 // CHECK-V6M-LIBSTDCXX-SAME: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-LIBSTDCXX-SAME: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-LIBSTDCXX-SAME: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-LIBSTDCXX-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 // CHECK-V6M-LIBSTDCXX-SAME: "{{.*}}.o"
 // CHECK-V6M-LIBSTDCXX-SAME: "-lstdc++" "-lm"
@@ -123,7 +123,7 @@
 // CHECK-V6M-NDL: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
 // CHECK-V6M-NDL: ld{{(.exe)?}}"
 // CHECK-V6M-NDL: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-V6M-NDL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-V6M-NDL: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-V6M-NDL-SAME: "-L{{[^"]*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm{{[/\\]+}}lib"
 
 // RUN: rm -rf %T/baremetal_cxx_sysroot
@@ -171,7 +171,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
 // CHECK-ARMV7EB: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ARMV7EB: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7EB: "-Bstatic" "-m" "armelfb_linux_eabi" "--be8" "-EB"
+// CHECK-ARMV7EB: "-Bstatic" "-m" "armelfb" "--be8" "-EB"
 
 // RUN: %clang -### %s --target=armv7-none-eabi -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EB %s
@@ -183,7 +183,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-ARMV7EL %s
 // CHECK-ARMV7EL: "{{.*}}ld{{(.exe)?}}"
 // CHECK-ARMV7EL: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-ARMV7EL: "-Bstatic" "-m" "armelf_linux_eabi" "-EL"
+// CHECK-ARMV7EL: "-Bstatic" "-m" "armelf" "-EL"
 // CHECK-ARMV7EL-NOT: "--be8"
 
 // RUN: %clang -### %s --target=armebv7-none-eabi -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -196,7 +196,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64BE %s
 // CHECK-AARCH64BE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-AARCH64BE: sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-AARCH64BE: "-Bstatic" "-m" "aarch64linuxb" "-EB"
+// CHECK-AARCH64BE: "-Bstatic" "-m" "aarch64elfb" "-EB"
 // CHECK-AARCH64BE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64-none-elf -mbig-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -209,7 +209,7 @@
 // RUN:   | FileCheck --check-prefix=CHECK-AARCH64LE %s
 // CHECK-AARCH64LE: "{{.*}}ld{{(.exe)?}}"
 // CHECK-AARCH64LE: "--sysroot={{.*}}{{[/\\]+}}Inputs{{[/\\]+}}baremetal_arm"
-// CHECK-AARCH64LE: "-Bstatic" "-m" "aarch64linux" "-EL"
+// CHECK-AARCH64LE: "-Bstatic" "-m" "aarch64elf" "-EL"
 // CHECK-AARCH64LE-NOT: "--be8"
 
 // RUN: %clang -### %s --target=aarch64_be-none-elf -mlittle-endian --sysroot=%S/Inputs/baremetal_arm 2>&1 \
@@ -257,7 +257,7 @@
 // CHECK-RV64-SAME:"{{.*}}.o"
 // CHECK-RV64-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-SAME: "-lc"
-// CHECK-RV64-SAME: "-X" "-o" "{{.*}}.tmp.out"
+// CHECK-RV64-SAME: "-o" "{{.*}}.tmp.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -271,7 +271,7 @@
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-DEFAULTCXX-SAME: "-lc"
-// CHECK-RV64-DEFAULTCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-DEFAULTCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### --target=riscv64-unknown-elf 2>&1 \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -288,7 +288,7 @@
 // CHECK-RV64-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV64-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBCXX-SAME: "-lc"
-// CHECK-RV64-LIBCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-LIBCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv64_tree/riscv64-unknown-elf \
@@ -305,7 +305,7 @@
 // CHECK-RV64-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV64-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV64-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV64-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV64-LIBSTDCXX-SAME: "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     -L some/directory/user/asked/for \
@@ -325,7 +325,7 @@
 // CHECK-RV32-SAME: "{{.*}}.o"
 // CHECK-RV32-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-SAME: "-lc"
-// CHECK-RV32-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -339,7 +339,7 @@
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-DEFAULTCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-DEFAULTCXX-SAME: "-lc"
-// CHECK-RV32-DEFAULTCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-DEFAULTCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -355,7 +355,7 @@
 // CHECK-RV32-LIBCXX-SAME: "{{.*}}.o"
 // CHECK-RV32-LIBCXX-SAME: "-lc++" "-lm"
 // CHECK-RV32-LIBCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
-// CHECK-RV32-LIBCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBCXX-SAME: "-o" "a.out"
 
 // RUN: %clangxx %s -### 2>&1 --target=riscv32-unknown-elf \
 // RUN:     --sysroot=%S/Inputs/basic_riscv32_tree/riscv32-unknown-elf \
@@ -372,7 +372,7 @@
 // CHECK-RV32-LIBSTDCXX-SAME: "-lstdc++" "-lm"
 // CHECK-RV32-LIBSTDCXX-SAME: "{{[^"]*}}libclang_rt.builtins.a"
 // CHECK-RV32-LIBSTDCXX-SAME: "-lc"
-// CHECK-RV32-LIBSTDCXX-SAME: "-X" "-o" "a.out"
+// CHECK-RV32-LIBSTDCXX-SAME: "-o" "a.out"
 
 // RUN: %clang %s -### 2>&1 --target=riscv64-unknown-elf \
 // RUN:     -nostdlibinc -nobuiltininc \
diff --git a/clang/test/Driver/mingw-msvcrt.c b/clang/test/Driver/mingw-msvcrt.c
index 340ce1f57b0f..e1648630476a 100644
--- a/clang/test/Driver/mingw-msvcrt.c
+++ b/clang/test/Driver/mingw-msvcrt.c
@@ -7,10 +7,10 @@
 // CHECK_DEFAULT: "-lmingwex" "-lmsvcrt" "-ladvapi32"
 // CHECK_DEFAULT-SAME: "-lmsvcrt" "-lkernel32" "{{.*}}crtend.o"
 // CHECK_MSVCR120: "-lmsvcr120"
-// CHECK_MSVCR120-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_MSVCR120-SAME: "-lmingwex" "-lmsvcr120" "-ladvapi32"
 // CHECK_UCRTBASE: "-lucrtbase"
-// CHECK_UCRTBASE-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_UCRTBASE-SAME: "-lmingwex" "-lucrtbase" "-ladvapi32"
 // CHECK_UCRT: "-lucrt"
-// CHECK_UCRT-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_UCRT-SAME: "-lmingwex" "-lucrt" "-ladvapi32"
 // CHECK_CRTDLL: "-lcrtdll"
-// CHECK_CRTDLL-SAME: "-lmingwex" "-ladvapi32"
+// CHECK_CRTDLL-SAME: "-lmingwex" "-lcrtdll" "-ladvapi32"
diff --git a/clang/test/Driver/openbsd.c b/clang/test/Driver/openbsd.c
index 6639e9d2d9d6..1f12cfca9488 100644
--- a/clang/test/Driver/openbsd.c
+++ b/clang/test/Driver/openbsd.c
@@ -127,9 +127,12 @@
 // UNWIND-TABLES: "-funwind-tables=2"
 // NO-UNWIND-TABLES-NOT: "-funwind-tables=2"
 
-// Check that the -X and --no-relax flags are passed to the linker on riscv64
+// Check that the -X and --no-relax flags are passed to the linker
+// RUN: %clang --target=loongarch64-unknown-openbsd -mno-relax -### %s 2>&1 \
+// RUN:   | FileCheck --check-prefix=LA64-FLAGS %s
 // RUN: %clang --target=riscv64-unknown-openbsd -mno-relax -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix=RISCV64-FLAGS %s
+// LA64-FLAGS: "-X" "--no-relax"
 // RISCV64-FLAGS: "-X" "--no-relax"
 
 // Check passing LTO flags to the linker
diff --git a/clang/test/Driver/print-multi-selection-flags.c b/clang/test/Driver/print-multi-selection-flags.c
index 5f9383fbed8f..c91d06c27df3 100644
--- a/clang/test/Driver/print-multi-selection-flags.c
+++ b/clang/test/Driver/print-multi-selection-flags.c
@@ -107,3 +107,22 @@
 // CHECK-AARCH64-MULTILIB-CUSTOM-FLAG: --target=aarch64-unknown-none-eabi
 // CHECK-MULTILIB-CUSTOM-FLAG-DAG:     -fmultilib-flag=foo
 // CHECK-MULTILIB-CUSTOM-FLAG-DAG:     -fmultilib-flag=bar
+
+// Start downstream change. Issue #528
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -Os | FileCheck --check-prefix=CHECK-OPT-OS %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -Oz | FileCheck --check-prefix=CHECK-OPT-OZ %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a     | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O1 | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O2 | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=arm-none-eabi -march=armv7a -O3 | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -Os           | FileCheck --check-prefix=CHECK-OPT-OS %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -Oz           | FileCheck --check-prefix=CHECK-OPT-OZ %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi               | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O1           | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O2           | FileCheck --check-prefix=CHECK-OPT %s
+// RUN: %clang -multi-lib-config=%S/Inputs/multilib/empty.yaml -print-multi-flags-experimental --target=aarch64-none-eabi -O3           | FileCheck --check-prefix=CHECK-OPT %s
+// CHECK-OPT-OZ: -Oz
+// CHECK-OPT-OS: -Os
+// CHECK-OPT-NOT: -Oz
+// CHECK-OPT-NOT: -Os
+// End downstream change. Issue #528
diff --git a/clang/test/Driver/sparc-target-features.c b/clang/test/Driver/sparc-target-features.c
index a839604ff1bc..bd17da112bbd 100644
--- a/clang/test/Driver/sparc-target-features.c
+++ b/clang/test/Driver/sparc-target-features.c
@@ -20,6 +20,11 @@
 
 // RUN: %clang --target=sparc -mvis2 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
 // RUN: %clang --target=sparc -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
+/// Solaris/SPARC defaults to -mvis2
+// RUN: %clang --target=sparc-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
+// RUN: %clang --target=sparcv9-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=VIS2 %s
+// RUN: %clang --target=sparcv9-sun-solaris2.11 -mno-vis2 %s -### 2>&1 | FileCheck -check-prefix=NO-VIS2 %s
 // VIS2: "-target-feature" "+vis2"
 // NO-VIS2: "-target-feature" "-vis2"
 
@@ -34,4 +39,8 @@
 // SOFT-QUAD-FLOAT: "-target-feature" "-hard-quad-float"
 
 // RUN: %clang --target=sparc -mv8plus %s -### 2>&1 | FileCheck -check-prefix=V8PLUS %s
+/// 32-bit Solaris/SPARC defaults to -mv8plus
+// RUN: %clang --target=sparc-sun-solaris2.11 %s -### 2>&1 | FileCheck -check-prefix=V8PLUS %s
+// RUN: %clang --target=sparc-sun-solaris2.11 -mno-v8plus %s -### 2>&1 | FileCheck -check-prefix=NO-V8PLUS %s
 // V8PLUS: "-target-feature" "+v8plus"
+// NO-V8PLUS-NOT: "-target-feature" "+v8plus"
diff --git a/clang/test/Modules/ExtDebugInfo.cpp b/clang/test/Modules/ExtDebugInfo.cpp
index 184973bc1783..3e74e2291d5e 100644
--- a/clang/test/Modules/ExtDebugInfo.cpp
+++ b/clang/test/Modules/ExtDebugInfo.cpp
@@ -8,7 +8,7 @@
 // RUN:     -fmodule-format=obj -fimplicit-module-maps -DMODULES \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t-mod.ll
-// RUN: cat %t-mod.ll |  FileCheck %s
+// RUN: cat %t-mod.ll |  FileCheck %s --check-prefix=CHECK %if target={{.*-(win|mingw|cyg).*}} %{--check-prefix=CHECKCOFF%} %else %{--check-prefix=CHECKELF%}
 
 // PCH:
 // RUN: %clang_cc1 -x c++ -std=c++11 -fmodule-format=obj -emit-pch -I%S/Inputs \
@@ -18,7 +18,7 @@
 // RUN:     -dwarf-ext-refs -fmodule-format=obj \
 // RUN:     -triple %itanium_abi_triple \
 // RUN:     -include-pch %t.pch %s -emit-llvm -o %t-pch.ll
-// RUN: cat %t-pch.ll |  FileCheck %s
+// RUN: cat %t-pch.ll |  FileCheck %s --check-prefix=CHECK %if target={{.*-(win|mingw|cyg).*}} %{--check-prefix=CHECKCOFF%} %else %{--check-prefix=CHECKELF%}
 // RUN: cat %t-pch.ll |  FileCheck %s --check-prefix=CHECK-PCH
 
 #ifdef MODULES
@@ -208,9 +208,9 @@ void foo() {
 // CHECK-SAME:              name: "InAnonymousNamespace", {{.*}}DIFlagFwdDecl)
 
 // There is a full definition of the type available in the module.
-// CHECK: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
-// CHECK-SAME:             DIFlagFwdDecl
-// CHECK-SAME:             identifier: "_ZTS7Virtual")
+// CHECKELF: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
+// CHECKELF-SAME:             DIFlagFwdDecl
+// CHECKELF-SAME:             identifier: "_ZTS7Virtual")
 
 // CHECK: !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !{{[0-9]+}}, entity: ![[STRUCT]], file: ![[CPP]], line: 50)
 
@@ -222,3 +222,8 @@ void foo() {
 
 // CHECK: !DICompositeType(tag: DW_TAG_class_type, name: "A",
 // CHECK-SAME:             DIFlagFwdDecl
+
+// There is a full definition of the type available in the module.
+// CHECKCOFF: !DICompositeType(tag: DW_TAG_structure_type, name: "Virtual",
+// CHECKCOFF-SAME:             DIFlagFwdDecl
+// CHECKCOFF-SAME:             identifier: "_ZTS7Virtual")
diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c
index 4fead33bd826..125872a001ba 100644
--- a/clang/test/Preprocessor/init-mips.c
+++ b/clang/test/Preprocessor/init-mips.c
@@ -80,10 +80,10 @@
 // MIPS32BE:#define __INTMAX_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INTMAX_TYPE__ long long int
 // MIPS32BE:#define __INTMAX_WIDTH__ 64
-// MIPS32BE:#define __INTPTR_FMTd__ "ld"
-// MIPS32BE:#define __INTPTR_FMTi__ "li"
-// MIPS32BE:#define __INTPTR_MAX__ 2147483647L
-// MIPS32BE:#define __INTPTR_TYPE__ long int
+// MIPS32BE:#define __INTPTR_FMTd__ "d"
+// MIPS32BE:#define __INTPTR_FMTi__ "i"
+// MIPS32BE:#define __INTPTR_MAX__ 2147483647
+// MIPS32BE:#define __INTPTR_TYPE__ int
 // MIPS32BE:#define __INTPTR_WIDTH__ 32
 // MIPS32BE:#define __INT_FAST16_FMTd__ "hd"
 // MIPS32BE:#define __INT_FAST16_FMTi__ "hi"
@@ -185,8 +185,8 @@
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32BE:#define __UINTMAX_WIDTH__ 64
-// MIPS32BE:#define __UINTPTR_MAX__ 4294967295UL
-// MIPS32BE:#define __UINTPTR_TYPE__ long unsigned int
+// MIPS32BE:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32BE:#define __UINTPTR_TYPE__ unsigned int
 // MIPS32BE:#define __UINTPTR_WIDTH__ 32
 // MIPS32BE:#define __UINT_FAST16_MAX__ 65535
 // MIPS32BE:#define __UINT_FAST16_TYPE__ unsigned short
@@ -300,10 +300,10 @@
 // MIPS32EL:#define __INTMAX_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INTMAX_TYPE__ long long int
 // MIPS32EL:#define __INTMAX_WIDTH__ 64
-// MIPS32EL:#define __INTPTR_FMTd__ "ld"
-// MIPS32EL:#define __INTPTR_FMTi__ "li"
-// MIPS32EL:#define __INTPTR_MAX__ 2147483647L
-// MIPS32EL:#define __INTPTR_TYPE__ long int
+// MIPS32EL:#define __INTPTR_FMTd__ "d"
+// MIPS32EL:#define __INTPTR_FMTi__ "i"
+// MIPS32EL:#define __INTPTR_MAX__ 2147483647
+// MIPS32EL:#define __INTPTR_TYPE__ int
 // MIPS32EL:#define __INTPTR_WIDTH__ 32
 // MIPS32EL:#define __INT_FAST16_FMTd__ "hd"
 // MIPS32EL:#define __INT_FAST16_FMTi__ "hi"
@@ -402,8 +402,8 @@
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
 // MIPS32EL:#define __UINTMAX_WIDTH__ 64
-// MIPS32EL:#define __UINTPTR_MAX__ 4294967295UL
-// MIPS32EL:#define __UINTPTR_TYPE__ long unsigned int
+// MIPS32EL:#define __UINTPTR_MAX__ 4294967295U
+// MIPS32EL:#define __UINTPTR_TYPE__ unsigned int
 // MIPS32EL:#define __UINTPTR_WIDTH__ 32
 // MIPS32EL:#define __UINT_FAST16_MAX__ 65535
 // MIPS32EL:#define __UINT_FAST16_TYPE__ unsigned short
@@ -547,10 +547,10 @@
 // MIPSN32BE: #define __INTMAX_MAX__ 9223372036854775807LL
 // MIPSN32BE: #define __INTMAX_TYPE__ long long int
 // MIPSN32BE: #define __INTMAX_WIDTH__ 64
-// MIPSN32BE: #define __INTPTR_FMTd__ "ld"
-// MIPSN32BE: #define __INTPTR_FMTi__ "li"
-// MIPSN32BE: #define __INTPTR_MAX__ 2147483647L
-// MIPSN32BE: #define __INTPTR_TYPE__ long int
+// MIPSN32BE: #define __INTPTR_FMTd__ "d"
+// MIPSN32BE: #define __INTPTR_FMTi__ "i"
+// MIPSN32BE: #define __INTPTR_MAX__ 2147483647
+// MIPSN32BE: #define __INTPTR_TYPE__ int
 // MIPSN32BE: #define __INTPTR_WIDTH__ 32
 // MIPSN32BE: #define __INT_FAST16_FMTd__ "hd"
 // MIPSN32BE: #define __INT_FAST16_FMTi__ "hi"
@@ -684,12 +684,12 @@
 // MIPSN32BE: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPSN32BE: #define __UINTMAX_TYPE__ long long unsigned int
 // MIPSN32BE: #define __UINTMAX_WIDTH__ 64
-// MIPSN32BE: #define __UINTPTR_FMTX__ "lX"
-// MIPSN32BE: #define __UINTPTR_FMTo__ "lo"
-// MIPSN32BE: #define __UINTPTR_FMTu__ "lu"
-// MIPSN32BE: #define __UINTPTR_FMTx__ "lx"
-// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295UL
-// MIPSN32BE: #define __UINTPTR_TYPE__ long unsigned int
+// MIPSN32BE: #define __UINTPTR_FMTX__ "X"
+// MIPSN32BE: #define __UINTPTR_FMTo__ "o"
+// MIPSN32BE: #define __UINTPTR_FMTu__ "u"
+// MIPSN32BE: #define __UINTPTR_FMTx__ "x"
+// MIPSN32BE: #define __UINTPTR_MAX__ 4294967295U
+// MIPSN32BE: #define __UINTPTR_TYPE__ unsigned int
 // MIPSN32BE: #define __UINTPTR_WIDTH__ 32
 // MIPSN32BE: #define __UINT_FAST16_FMTX__ "hX"
 // MIPSN32BE: #define __UINT_FAST16_FMTo__ "ho"
@@ -864,10 +864,10 @@
 // MIPSN32EL: #define __INTMAX_MAX__ 9223372036854775807LL
 // MIPSN32EL: #define __INTMAX_TYPE__ long long int
 // MIPSN32EL: #define __INTMAX_WIDTH__ 64
-// MIPSN32EL: #define __INTPTR_FMTd__ "ld"
-// MIPSN32EL: #define __INTPTR_FMTi__ "li"
-// MIPSN32EL: #define __INTPTR_MAX__ 2147483647L
-// MIPSN32EL: #define __INTPTR_TYPE__ long int
+// MIPSN32EL: #define __INTPTR_FMTd__ "d"
+// MIPSN32EL: #define __INTPTR_FMTi__ "i"
+// MIPSN32EL: #define __INTPTR_MAX__ 2147483647
+// MIPSN32EL: #define __INTPTR_TYPE__ int
 // MIPSN32EL: #define __INTPTR_WIDTH__ 32
 // MIPSN32EL: #define __INT_FAST16_FMTd__ "hd"
 // MIPSN32EL: #define __INT_FAST16_FMTi__ "hi"
@@ -1001,12 +1001,12 @@
 // MIPSN32EL: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPSN32EL: #define __UINTMAX_TYPE__ long long unsigned int
 // MIPSN32EL: #define __UINTMAX_WIDTH__ 64
-// MIPSN32EL: #define __UINTPTR_FMTX__ "lX"
-// MIPSN32EL: #define __UINTPTR_FMTo__ "lo"
-// MIPSN32EL: #define __UINTPTR_FMTu__ "lu"
-// MIPSN32EL: #define __UINTPTR_FMTx__ "lx"
-// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295UL
-// MIPSN32EL: #define __UINTPTR_TYPE__ long unsigned int
+// MIPSN32EL: #define __UINTPTR_FMTX__ "X"
+// MIPSN32EL: #define __UINTPTR_FMTo__ "o"
+// MIPSN32EL: #define __UINTPTR_FMTu__ "u"
+// MIPSN32EL: #define __UINTPTR_FMTx__ "x"
+// MIPSN32EL: #define __UINTPTR_MAX__ 4294967295U
+// MIPSN32EL: #define __UINTPTR_TYPE__ unsigned int
 // MIPSN32EL: #define __UINTPTR_WIDTH__ 32
 // MIPSN32EL: #define __UINT_FAST16_FMTX__ "hX"
 // MIPSN32EL: #define __UINT_FAST16_FMTo__ "ho"
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index bed39dc3e34d..7e0df9614136 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -1622,6 +1622,14 @@
 // RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=amd64-unknown-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix OPENBSD-STDC-N %s
 // OPENBSD-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
 //
+// RUN: %clang_cc1 -x c -std=c11 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// RUN: %clang_cc1 -x c -std=gnu11 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// RUN: %clang_cc1 -x c -std=c17 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC %s
+// DRAGONFLY-STDC:#define __STDC_NO_THREADS__ 1
+//
+// RUN: %clang_cc1 -x c -std=c99 -E -dM -ffreestanding -triple=x86_64-unknown-dragonfly < /dev/null | FileCheck -match-full-lines -check-prefix DRAGONFLY-STDC-N %s
+// DRAGONFLY-STDC-N-NOT:#define __STDC_NO_THREADS__ 1
+//
 // RUN: %clang_cc1 -triple=aarch64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
 // RUN: %clang_cc1 -triple=riscv64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
 // RUN: %clang_cc1 -triple=x86_64-unknown-managarm-mlibc -E -dM < /dev/null | FileCheck -match-full-lines -check-prefix MANAGARM %s
diff --git a/clang/test/Preprocessor/ptrauth_extension.c b/clang/test/Preprocessor/ptrauth_extension.c
index d6b79187ba62..3267b0786c28 100644
--- a/clang/test/Preprocessor/ptrauth_extension.c
+++ b/clang/test/Preprocessor/ptrauth_extension.c
@@ -4,10 +4,32 @@
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-calls | \
 // RUN:   FileCheck %s --check-prefixes=NOINTRIN
 
-#if __has_extension(ptrauth_qualifier)
-// INTRIN: has_ptrauth_qualifier
-void has_ptrauth_qualifier() {}
-#else
+// RUN: %clang_cc1 -E %s -DIS_DARWIN -triple=arm64e-apple-darwin -fptrauth-intrinsics | \
+// RUN:   FileCheck %s --check-prefixes=INTRIN,INTRIN_MAC
+
+// RUN: %clang_cc1 -E %s -DIS_DARWIN -triple=arm64e-apple-darwin -fptrauth-calls | \
+// RUN:   FileCheck %s --check-prefixes=NOINTRIN
+
+#if defined(IS_DARWIN) && __has_extension(ptrauth_qualifier)
+// INTRIN_MAC: has_ptrauth_qualifier1
+void has_ptrauth_qualifier1() {}
+#ifndef __PTRAUTH__
+#error ptrauth_qualifier extension present without predefined test macro
+#endif
+#endif
+#if defined(IS_DARWIN) && __has_feature(ptrauth_qualifier)
+// INTRIN_MAC: has_ptrauth_qualifier2
+void has_ptrauth_qualifier2() {}
+#ifndef __PTRAUTH__
+#error ptrauth_qualifier extension present without predefined test macro
+#endif
+#endif
+#if defined(__PTRAUTH__)
+// INTRIN: has_ptrauth_qualifier3
+void has_ptrauth_qualifier3() {}
+#endif
+
+#if !defined(__PTRAUTH__) && !__has_feature(ptrauth_qualifier) && !__has_extension(ptrauth_qualifier)
 // NOINTRIN: no_ptrauth_qualifier
 void no_ptrauth_qualifier() {}
 #endif
diff --git a/clang/test/Preprocessor/ptrauth_feature.c b/clang/test/Preprocessor/ptrauth_feature.c
index a440791d6cc6..cebea4188415 100644
--- a/clang/test/Preprocessor/ptrauth_feature.c
+++ b/clang/test/Preprocessor/ptrauth_feature.c
@@ -34,7 +34,7 @@
 // RUN: %clang_cc1 -E %s -triple=aarch64 -fptrauth-elf-got | \
 // RUN:   FileCheck %s --check-prefixes=NOINTRIN,NOCALLS,NORETS,NOVPTR_ADDR_DISCR,NOVPTR_TYPE_DISCR,NOTYPE_INFO_DISCR,NOFUNC,NOINITFINI,NOINITFINI_ADDR_DISCR,NOGOTOS,ELFGOT
 
-#if __has_feature(ptrauth_intrinsics)
+#if defined(__PTRAUTH__)
 // INTRIN: has_ptrauth_intrinsics
 void has_ptrauth_intrinsics() {}
 #else
@@ -130,3 +130,11 @@ void has_ptrauth_elf_got() {}
 // NOELFGOT: no_ptrauth_elf_got
 void no_ptrauth_elf_got() {}
 #endif
+
+#if __has_feature(ptrauth_objc_signable_class)
+// INTRIN: has_ptrauth_objc_signable_class
+void has_ptrauth_objc_signable_class(){}
+#else
+// NOINTRIN: no_ptrauth_objc_signable_class
+void no_ptrauth_objc_signable_class(){}
+#endif
diff --git a/clang/test/Preprocessor/stdint.c b/clang/test/Preprocessor/stdint.c
index 899ff59bf0b6..9f982a3a94fd 100644
--- a/clang/test/Preprocessor/stdint.c
+++ b/clang/test/Preprocessor/stdint.c
@@ -350,8 +350,8 @@
 // MIPS:typedef int8_t int_fast8_t;
 // MIPS:typedef uint8_t uint_fast8_t;
 //
-// MIPS:typedef long int intptr_t;
-// MIPS:typedef long unsigned int uintptr_t;
+// MIPS:typedef int intptr_t;
+// MIPS:typedef unsigned int uintptr_t;
 //
 // MIPS:typedef long long int intmax_t;
 // MIPS:typedef long long unsigned int uintmax_t;
@@ -396,9 +396,9 @@
 // MIPS:INT_FAST64_MAX_ 9223372036854775807LL
 // MIPS:UINT_FAST64_MAX_ 18446744073709551615ULL
 //
-// MIPS:INTPTR_MIN_ (-2147483647L -1)
-// MIPS:INTPTR_MAX_ 2147483647L
-// MIPS:UINTPTR_MAX_ 4294967295UL
+// MIPS:INTPTR_MIN_ (-2147483647 -1)
+// MIPS:INTPTR_MAX_ 2147483647
+// MIPS:UINTPTR_MAX_ 4294967295U
 // MIPS:PTRDIFF_MIN_ (-2147483647 -1)
 // MIPS:PTRDIFF_MAX_ 2147483647
 // MIPS:SIZE_MAX_ 4294967295U
diff --git a/clang/test/Sema/attr-nonstring.c b/clang/test/Sema/attr-nonstring.c
index 3838aa3bbee1..fe7b6d259dd7 100644
--- a/clang/test/Sema/attr-nonstring.c
+++ b/clang/test/Sema/attr-nonstring.c
@@ -229,3 +229,11 @@ struct Outer o2[] = {
     }
   }
 };
+
+// The attribute also works with a pointer type, not just an array type.
+__attribute__((nonstring)) char *ptr1;
+__attribute__((nonstring)) const unsigned char *ptr2;
+struct GH150951 {
+  __attribute__((nonstring)) char *ptr1;
+  __attribute__((nonstring)) const unsigned char *ptr2;
+};
diff --git a/clang/test/Sema/constexpr.c b/clang/test/Sema/constexpr.c
index 3dcb0b3a7d95..0987d175c91a 100644
--- a/clang/test/Sema/constexpr.c
+++ b/clang/test/Sema/constexpr.c
@@ -391,3 +391,10 @@ void ghissue109095() {
   _Static_assert(i == c[0]); // expected-error {{static assertion expression is not an integral constant expression}}\
                              // expected-note {{initializer of 'i' is not a constant expression}}
 }
+
+typedef bool __vbool2  __attribute__((ext_vector_type(2)));
+typedef short v2int16_t __attribute__((ext_vector_type(2)));
+
+bool issue155507(v2int16_t a, v2int16_t b) {
+    return __builtin_bit_cast(unsigned char, __builtin_convertvector(a == b, __vbool2)) == 0b11;
+}
diff --git a/clang/test/Sema/for.c b/clang/test/Sema/for.c
index e16169aac0c4..35c4720ef330 100644
--- a/clang/test/Sema/for.c
+++ b/clang/test/Sema/for.c
@@ -26,6 +26,5 @@ void b11 (void) { for (static _Thread_local struct { int i; } s;s.i;); } /* c11-
 #endif
 
 void b12(void) {
-  for(_Static_assert(1, "");;) {} /* c11-warning {{non-variable declaration in 'for' loop is a C23 extension}}
-                                     c23-warning {{non-variable declaration in 'for' loop is incompatible with C standards before C23}} */
+  for(_Static_assert(1, "");;) {} /* okay, _Static_assert declares *no* identifiers */
 }
diff --git a/clang/test/Sema/implicit-void-ptr-cast.c b/clang/test/Sema/implicit-void-ptr-cast.c
index 3c3e153d1dbd..d5e949719144 100644
--- a/clang/test/Sema/implicit-void-ptr-cast.c
+++ b/clang/test/Sema/implicit-void-ptr-cast.c
@@ -82,3 +82,15 @@ void more(void) {
   ptr3 = SOMETHING_THAT_IS_NOT_NULL; // c-warning {{implicit conversion when assigning to 'char *' from type 'void *' is not permitted in C++}} \
                                         cxx-error {{assigning to 'char *' from incompatible type 'void *'}}
 }
+
+void gh154157(void) {
+  #define ATOMIC_VAR_INIT(value) (value)
+
+  typedef const struct T * T_Ref;
+  static T_Ref _Atomic x = ATOMIC_VAR_INIT((void*)NULL); // c-warning {{implicit conversion when initializing '_Atomic(T_Ref)' with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type '_Atomic(T_Ref)' with an rvalue of type 'void *'}}
+  static T_Ref const y = ATOMIC_VAR_INIT((void*)NULL);   // c-warning {{implicit conversion when initializing 'const T_Ref' (aka 'const struct T *const') with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type 'const T_Ref' (aka 'const T *const') with an rvalue of type 'void *'}}
+  static T_Ref z = ATOMIC_VAR_INIT((void*)NULL);         // c-warning {{implicit conversion when initializing 'T_Ref' (aka 'const struct T *') with an expression of type 'void *' is not permitted in C++}} \
+                                                            cxx-error {{cannot initialize a variable of type 'T_Ref' (aka 'const T *') with an rvalue of type 'void *'}}
+}
diff --git a/clang/test/Sema/ptrauth-qualifier.c b/clang/test/Sema/ptrauth-qualifier.c
index 5d932b724f07..3e568ce9f37e 100644
--- a/clang/test/Sema/ptrauth-qualifier.c
+++ b/clang/test/Sema/ptrauth-qualifier.c
@@ -1,13 +1,25 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -DIS_DARWIN -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -std=c23 -fsyntax-only -verify -fptrauth-intrinsics %s
 
-#if !__has_extension(ptrauth_qualifier)
+#if defined(IS_DARWIN) && !__has_extension(ptrauth_qualifier)
 // This error means that the __ptrauth qualifier availability test says  that it
 // is not available. This error is not expected in the output, if it is seen
 // there is a feature detection regression.
 #error __ptrauth qualifier not enabled
 #endif
 
+#if defined(IS_DARWIN) && !__has_feature(ptrauth_qualifier)
+// This error means that the __has_feature test for ptrauth_qualifier has
+// failed, despite it being expected on darwin.
+#error __ptrauth qualifier not enabled
+#elif !defined(IS_DARWIN) && (__has_feature(ptrauth_qualifier) || __has_extension(ptrauth_qualifier))
+#error ptrauth_qualifier labeled a feature on a non-darwin platform
+#endif
+
+#if !defined (__PTRAUTH__)
+#error __PTRAUTH__ test macro not defined when ptrauth is enabled
+#endif
+
 #if __aarch64__
 #define VALID_CODE_KEY 0
 #define VALID_DATA_KEY 2
diff --git a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
index 38dfdb98f08f..a956386ae933 100644
--- a/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
+++ b/clang/test/Sema/warn-lifetime-safety-dataflow.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s
+// RUN: %clang_cc1 -fexperimental-lifetime-safety -mllvm -debug-only=LifetimeFacts,LifetimeDataflow -Wexperimental-lifetime-safety %s 2>&1 | FileCheck %s
 // REQUIRES: asserts
 
 struct MyObj {
diff --git a/clang/test/SemaCXX/constant-expression-cxx11.cpp b/clang/test/SemaCXX/constant-expression-cxx11.cpp
index c390fee1c38d..935409131e18 100644
--- a/clang/test/SemaCXX/constant-expression-cxx11.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx11.cpp
@@ -2615,3 +2615,19 @@ namespace DoubleCapture {
     };
   }
 }
+
+namespace GH154567 {
+  struct T {
+    int i;
+  };
+
+  struct S {
+    struct { // expected-warning {{GNU extension}}
+      T val;
+    };
+    constexpr S() : val() {}
+  };
+
+  constexpr S s{};
+  static_assert(s.val.i == 0, "");
+}
diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp
index e16a69df3830..e93b98c185a8 100644
--- a/clang/test/SemaCXX/constant-expression-cxx14.cpp
+++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp
@@ -1321,3 +1321,18 @@ constexpr bool check = different_in_loop();
   // expected-error@-1 {{}} expected-note@-1 {{in call}}
 
 }
+
+namespace comparison_dead_variable {
+  constexpr bool f() {
+    int *p1 = 0, *p2 = 0;
+    {
+        int x = 0; p1 = &x;
+    }
+    {
+        int x = 0; p2 = &x;
+    }
+    return p1 != p2;
+  }
+  // FIXME: This should fail.
+  static_assert(f(),"");
+}
diff --git a/clang/test/SemaCXX/constant-expression-p2280r4.cpp b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
index dffb386f530f..312a77830420 100644
--- a/clang/test/SemaCXX/constant-expression-p2280r4.cpp
+++ b/clang/test/SemaCXX/constant-expression-p2280r4.cpp
@@ -319,7 +319,7 @@ namespace casting {
 }
 
 namespace pointer_comparisons {
-  extern int &extern_n; // interpreter-note 2 {{declared here}}
+  extern int &extern_n; // interpreter-note 4 {{declared here}}
   extern int &extern_n2;
   constexpr int f1(bool b, int& n) {
     if (b) {
@@ -330,14 +330,70 @@ namespace pointer_comparisons {
   // FIXME: interpreter incorrectly rejects; both sides are the same constexpr-unknown value.
   static_assert(f1(false, extern_n)); // interpreter-error {{static assertion expression is not an integral constant expression}} \
                                       // interpreter-note {{initializer of 'extern_n' is unknown}}
-  // FIXME: We should diagnose this: we don't know if the references bind
-  // to the same object.
-  static_assert(&extern_n != &extern_n2); // interpreter-error {{static assertion expression is not an integral constant expression}} \
+  static_assert(&extern_n != &extern_n2); // expected-error {{static assertion expression is not an integral constant expression}} \
+                                          // nointerpreter-note {{comparison between pointers to unrelated objects '&extern_n' and '&extern_n2' has unspecified value}} \
                                           // interpreter-note {{initializer of 'extern_n' is unknown}}
   void f2(const int &n) {
-    // FIXME: We should not diagnose this: the two objects provably have
-    // different addresses because the lifetime of "n" extends across
-    // the initialization.
-    constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}}
+    constexpr int x = &x == &n; // nointerpreter-error {{must be initialized by a constant expression}} \
+                                // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&n' has unspecified value}}
+    // Distinct variables are not equal, even if they're local variables.
+    constexpr int y = &x == &y;
+    static_assert(!y);
   }
+  constexpr int f3() {
+    int x;
+    return &x == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&x' and '&extern_n' has unspecified value}} \
+                            // interpreter-note {{initializer of 'extern_n' is unknown}}
+  }
+  static_assert(!f3()); // expected-error {{static assertion expression is not an integral constant expression}} \
+                        // expected-note {{in call to 'f3()'}}
+  constexpr int f4() {
+    int *p = new int;
+    bool b = p == &extern_n; // nointerpreter-note {{comparison between pointers to unrelated objects '&{*new int#0}' and '&extern_n' has unspecified value}} \
+                             // interpreter-note {{initializer of 'extern_n' is unknown}}
+    delete p;
+    return b;
+  }
+  static_assert(!f4()); // expected-error {{static assertion expression is not an integral constant expression}} \
+                        // expected-note {{in call to 'f4()'}}
+}
+
+namespace GH149188 {
+namespace enable_if_1 {
+  template <__SIZE_TYPE__ N>
+  constexpr void foo(const char (&Str)[N])
+  __attribute((enable_if(__builtin_strlen(Str), ""))) {}
+
+  void x() {
+      foo("1234");
+  }
+}
+
+namespace enable_if_2 {
+  constexpr const char (&f())[];
+  extern const char (&Str)[];
+  constexpr int foo()
+  __attribute((enable_if(__builtin_strlen(Str), "")))
+  {return __builtin_strlen(Str);}
+
+  constexpr const char (&f())[] {return "a";}
+  constexpr const char (&Str)[] = f();
+  void x() {
+      constexpr int x = foo();
+  }
+}
+}
+
+namespace GH150015 {
+  extern int (& c)[8]; // interpreter-note {{declared here}}
+  constexpr int x = c <= c+8; // interpreter-error {{constexpr variable 'x' must be initialized by a constant expression}} \
+                              // interpreter-note {{initializer of 'c' is unknown}}
+
+  struct X {};
+  struct Y {};
+  struct Z : X, Y {};
+  extern Z &z; // interpreter-note{{declared here}}
+  constexpr int bases = (void*)(X*)&z <= (Y*)&z; // expected-error {{constexpr variable 'bases' must be initialized by a constant expression}} \
+                                                 // nointerpreter-note {{comparison of addresses of subobjects of different base classes has unspecified value}} \
+                                                 // interpreter-note {{initializer of 'z' is unknown}}
 }
diff --git a/clang/test/SemaCXX/constexpr-never-constant.cpp b/clang/test/SemaCXX/constexpr-never-constant.cpp
index 307810ee263d..5756bb647ce8 100644
--- a/clang/test/SemaCXX/constexpr-never-constant.cpp
+++ b/clang/test/SemaCXX/constexpr-never-constant.cpp
@@ -24,3 +24,10 @@ constexpr void other_func() {
 
   throw 12;
 }
+
+namespace GH149041 {
+  // Make sure these don't trigger the diagnostic.
+  extern const bool& b;
+  constexpr bool fun1() { return b; }
+  constexpr bool fun2(const bool& b) { return b; }
+}
diff --git a/clang/test/SemaCXX/noreturn-weverything.c b/clang/test/SemaCXX/noreturn-weverything.c
new file mode 100644
index 000000000000..92a587d39563
--- /dev/null
+++ b/clang/test/SemaCXX/noreturn-weverything.c
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -fsyntax-only %s -Weverything
+
+void free(void *);
+typedef void (*set_free_func)(void *);
+struct Method {
+  int nparams;
+  int *param;
+};
+void selelem_free_method(struct Method* method, void* data) {
+    set_free_func free_func = 0;
+    for (int i = 0; i < method->nparams; ++i)
+        free(&method->param[i]);
+    if (data && free_func)
+        free_func(data);
+}
diff --git a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
index 135865c8450f..46c367084852 100644
--- a/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
+++ b/clang/test/SemaCXX/overload-resolution-deferred-templates.cpp
@@ -283,31 +283,3 @@ void f() {
 }
 
 #endif
-
-namespace GH147374 {
-
-struct String {};
-template <typename T> void operator+(T, String &&) = delete;
-
-struct Bar {
-    void operator+(String) const; // expected-note {{candidate function}}
-    friend void operator+(Bar, String) {};  // expected-note {{candidate function}}
-};
-
-struct Baz {
-    void operator+(String); // expected-note {{candidate function}}
-    friend void operator+(Baz, String) {}; // expected-note {{candidate function}}
-};
-
-void test() {
-    Bar a;
-    String b;
-    a + b;
-    //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Bar' and 'String')}}
-
-    Baz z;
-    z + b;
-    //expected-error@-1 {{use of overloaded operator '+' is ambiguous (with operand types 'Baz' and 'String')}}
-}
-
-}
diff --git a/clang/test/SemaCXX/ptrauth-triviality.cpp b/clang/test/SemaCXX/ptrauth-triviality.cpp
index ba8a8273d5c0..b1b334b59a59 100644
--- a/clang/test/SemaCXX/ptrauth-triviality.cpp
+++ b/clang/test/SemaCXX/ptrauth-triviality.cpp
@@ -74,7 +74,7 @@ static_assert(__is_trivially_destructible(S3));
 static_assert(!__is_trivially_copyable(S3));
 static_assert(!__is_trivially_relocatable(S3)); // expected-warning{{deprecated}}
 //FIXME
-static_assert(__builtin_is_cpp_trivially_relocatable(S3));
+static_assert(!__builtin_is_cpp_trivially_relocatable(S3));
 static_assert(!__is_trivially_equality_comparable(S3));
 
 
@@ -84,7 +84,7 @@ static_assert(!__is_trivially_assignable(Holder<S3>, const Holder<S3>&));
 static_assert(__is_trivially_destructible(Holder<S3>));
 static_assert(!__is_trivially_copyable(Holder<S3>));
 static_assert(!__is_trivially_relocatable(Holder<S3>)); // expected-warning{{deprecated}}
-static_assert(__builtin_is_cpp_trivially_relocatable(Holder<S3>));
+static_assert(!__builtin_is_cpp_trivially_relocatable(Holder<S3>));
 static_assert(!__is_trivially_equality_comparable(Holder<S3>));
 
 struct IA S4 {
@@ -207,7 +207,7 @@ template <class T> struct UnionWrapper trivially_relocatable_if_eligible {
   } u;
 };
 
-static_assert(test_is_trivially_relocatable_v<AddressDiscriminatedPolymorphicBase>);
+static_assert(!test_is_trivially_relocatable_v<AddressDiscriminatedPolymorphicBase>);
 static_assert(test_is_trivially_relocatable_v<NoAddressDiscriminatedPolymorphicBase>);
 static_assert(inheritance_relocatability_matches_bases_v<AddressDiscriminatedPolymorphicBase, NoAddressDiscriminatedPolymorphicBase>);
 static_assert(inheritance_relocatability_matches_bases_v<NoAddressDiscriminatedPolymorphicBase, AddressDiscriminatedPolymorphicBase>);
diff --git a/clang/test/SemaCXX/ptrauth-type-traits.cpp b/clang/test/SemaCXX/ptrauth-type-traits.cpp
new file mode 100644
index 000000000000..aefbd63fa167
--- /dev/null
+++ b/clang/test/SemaCXX/ptrauth-type-traits.cpp
@@ -0,0 +1,401 @@
+// RUN: %clang_cc1 -triple arm64               -std=c++26 -Wno-deprecated-builtins \
+// RUN:                                        -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple arm64-apple-darwin -fptrauth-calls -fptrauth-intrinsics \
+// RUN:                                       -fptrauth-vtable-pointer-address-discrimination \
+// RUN:                                       -std=c++26 -Wno-deprecated-builtins \
+// RUN:                                       -fsyntax-only -verify %s
+
+// expected-no-diagnostics
+
+#ifdef __PTRAUTH__
+
+#define NonAddressDiscriminatedVTablePtrAttr \
+  [[clang::ptrauth_vtable_pointer(process_independent, no_address_discrimination, no_extra_discrimination)]]
+#define AddressDiscriminatedVTablePtrAttr \
+  [[clang::ptrauth_vtable_pointer(process_independent, address_discrimination, no_extra_discrimination)]]
+#define ADDR_DISC_ENABLED true
+#else
+#define NonAddressDiscriminatedVTablePtrAttr
+#define AddressDiscriminatedVTablePtrAttr
+#define ADDR_DISC_ENABLED false
+#define __ptrauth(...)
+#endif
+
+
+typedef int* __ptrauth(1,1,1) AddressDiscriminatedPtr;
+typedef __UINT64_TYPE__ __ptrauth(1,1,1) AddressDiscriminatedInt64;
+struct AddressDiscriminatedFields {
+  AddressDiscriminatedPtr ptr;
+};
+struct RelocatableAddressDiscriminatedFields trivially_relocatable_if_eligible {
+  AddressDiscriminatedPtr ptr;
+};
+struct AddressDiscriminatedFieldInBaseClass : AddressDiscriminatedFields {
+  void *newfield;
+};
+
+struct NonAddressDiscriminatedVTablePtrAttr NonAddressDiscriminatedVTablePtr {
+  virtual ~NonAddressDiscriminatedVTablePtr();
+  void *i;
+};
+
+struct NonAddressDiscriminatedVTablePtrAttr NonAddressDiscriminatedVTablePtr2 {
+  virtual ~NonAddressDiscriminatedVTablePtr2();
+  void *j;
+};
+
+struct NonAddressDiscriminatedVTablePtrAttr RelocatableNonAddressDiscriminatedVTablePtr trivially_relocatable_if_eligible {
+  virtual ~RelocatableNonAddressDiscriminatedVTablePtr();
+  void *i;
+};
+
+struct NonAddressDiscriminatedVTablePtrAttr RelocatableNonAddressDiscriminatedVTablePtr2 trivially_relocatable_if_eligible {
+  virtual ~RelocatableNonAddressDiscriminatedVTablePtr2();
+  void *j;
+};
+
+struct AddressDiscriminatedVTablePtrAttr AddressDiscriminatedVTablePtr {
+  virtual ~AddressDiscriminatedVTablePtr();
+  void *k;
+};
+
+struct AddressDiscriminatedVTablePtrAttr RelocatableAddressDiscriminatedVTablePtr trivially_relocatable_if_eligible {
+  virtual ~RelocatableAddressDiscriminatedVTablePtr();
+  void *k;
+};
+
+struct NoAddressDiscriminatedBaseClasses : NonAddressDiscriminatedVTablePtr,
+                                           NonAddressDiscriminatedVTablePtr2 {
+  void *l;
+};
+
+struct RelocatableNoAddressDiscriminatedBaseClasses trivially_relocatable_if_eligible :
+                                           NonAddressDiscriminatedVTablePtr,
+                                           NonAddressDiscriminatedVTablePtr2 {
+  void *l;
+};
+
+struct AddressDiscriminatedPrimaryBase : AddressDiscriminatedVTablePtr,
+                                         NonAddressDiscriminatedVTablePtr {
+  void *l;
+};
+struct AddressDiscriminatedSecondaryBase : NonAddressDiscriminatedVTablePtr,
+                                           AddressDiscriminatedVTablePtr {
+  void *l;
+};
+
+struct RelocatableAddressDiscriminatedPrimaryBase : RelocatableAddressDiscriminatedVTablePtr,
+                                         RelocatableNonAddressDiscriminatedVTablePtr {
+  void *l;
+};
+struct RelocatableAddressDiscriminatedSecondaryBase : RelocatableNonAddressDiscriminatedVTablePtr,
+                                           RelocatableAddressDiscriminatedVTablePtr {
+  void *l;
+};
+struct EmbdeddedAddressDiscriminatedPolymorphicClass {
+  AddressDiscriminatedVTablePtr field;
+};
+struct RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass trivially_relocatable_if_eligible {
+  AddressDiscriminatedVTablePtr field;
+};
+
+static_assert( __is_pod(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_pod(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __is_pod(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_pod(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert(!__is_pod(AddressDiscriminatedFieldInBaseClass));
+static_assert(!__is_pod(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_pod(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_pod(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_pod(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_pod(AddressDiscriminatedVTablePtr));
+static_assert(!__is_pod(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_pod(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_pod(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_pod(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_pod(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_pod(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_pod(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_pod(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_pod(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_standard_layout(AddressDiscriminatedPtr));
+static_assert( __is_standard_layout(AddressDiscriminatedInt64));
+static_assert( __is_standard_layout(AddressDiscriminatedFields));
+static_assert( __is_standard_layout(RelocatableAddressDiscriminatedFields));
+static_assert(!__is_standard_layout(AddressDiscriminatedFieldInBaseClass));
+static_assert(!__is_standard_layout(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_standard_layout(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_standard_layout(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_standard_layout(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_standard_layout(AddressDiscriminatedVTablePtr));
+static_assert(!__is_standard_layout(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_standard_layout(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_standard_layout(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_standard_layout(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_standard_layout(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_standard_layout(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_standard_layout(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_standard_layout(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_standard_layout(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __has_trivial_move_constructor(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_constructor(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_constructor(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_constructor(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_constructor(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__has_trivial_move_constructor(NonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_constructor(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_move_constructor(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_constructor(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_move_constructor(AddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_constructor(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_constructor(NoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_move_constructor(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_move_constructor(AddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_move_constructor(AddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_move_constructor(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_move_constructor(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_move_constructor(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__has_trivial_move_constructor(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __has_trivial_copy(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_copy(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_copy(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_copy(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_copy(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__has_trivial_copy(NonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_copy(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_copy(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_copy(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_copy(AddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_copy(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_copy(NoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_copy(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_copy(AddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_copy(AddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_copy(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_copy(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_copy(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__has_trivial_copy(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __has_trivial_assign(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_assign(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_assign(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_assign(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_assign(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__has_trivial_assign(NonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_assign(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_assign(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_assign(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_assign(AddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_assign(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_assign(NoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_assign(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_assign(AddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_assign(AddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_assign(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_assign(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_assign(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__has_trivial_assign(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __has_trivial_move_assign(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_assign(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_assign(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_assign(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_trivial_move_assign(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__has_trivial_move_assign(NonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_assign(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_move_assign(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_assign(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_trivial_move_assign(AddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_assign(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__has_trivial_move_assign(NoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_move_assign(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__has_trivial_move_assign(AddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_move_assign(AddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_move_assign(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__has_trivial_move_assign(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__has_trivial_move_assign(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__has_trivial_move_assign(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_trivial(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivial(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivial(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivial(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivial(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__is_trivial(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivial(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivial(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivial(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivial(AddressDiscriminatedVTablePtr));
+static_assert(!__is_trivial(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivial(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivial(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivial(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivial(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivial(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivial(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivial(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_trivial(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_trivially_copyable(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_copyable(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_copyable(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_copyable(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_copyable(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__is_trivially_copyable(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_copyable(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_copyable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_copyable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_copyable(AddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_copyable(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_copyable(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_copyable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_copyable(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_copyable(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_copyable(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_copyable(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_copyable(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_trivially_copyable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_trivially_equality_comparable(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_equality_comparable(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert(!__is_trivially_equality_comparable(AddressDiscriminatedFields));
+static_assert(!__is_trivially_equality_comparable(RelocatableAddressDiscriminatedFields));
+static_assert(!__is_trivially_equality_comparable(AddressDiscriminatedFieldInBaseClass));
+static_assert(!__is_trivially_equality_comparable(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_equality_comparable(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_equality_comparable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_equality_comparable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_equality_comparable(AddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_equality_comparable(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_equality_comparable(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_equality_comparable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_equality_comparable(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_equality_comparable(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_equality_comparable(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_equality_comparable(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_equality_comparable(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_trivially_equality_comparable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_trivially_relocatable(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_relocatable(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_relocatable(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_relocatable(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_trivially_relocatable(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__is_trivially_relocatable(NonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_relocatable(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_relocatable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_relocatable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__is_trivially_relocatable(AddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_relocatable(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__is_trivially_relocatable(NoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_relocatable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__is_trivially_relocatable(AddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_relocatable(AddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_relocatable(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__is_trivially_relocatable(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__is_trivially_relocatable(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__is_trivially_relocatable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __builtin_is_cpp_trivially_relocatable(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __builtin_is_cpp_trivially_relocatable(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __builtin_is_cpp_trivially_relocatable(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __builtin_is_cpp_trivially_relocatable(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__builtin_is_cpp_trivially_relocatable(NonAddressDiscriminatedVTablePtr));
+static_assert(!__builtin_is_cpp_trivially_relocatable(NonAddressDiscriminatedVTablePtr2));
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscriminatedVTablePtr));
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableAddressDiscriminatedVTablePtr) == !ADDR_DISC_ENABLED);
+static_assert(!__builtin_is_cpp_trivially_relocatable(NoAddressDiscriminatedBaseClasses));
+static_assert(!__builtin_is_cpp_trivially_relocatable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscriminatedPrimaryBase));
+static_assert(!__builtin_is_cpp_trivially_relocatable(AddressDiscriminatedSecondaryBase));
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableAddressDiscriminatedPrimaryBase) == !ADDR_DISC_ENABLED);
+static_assert( __builtin_is_cpp_trivially_relocatable(RelocatableAddressDiscriminatedSecondaryBase) == !ADDR_DISC_ENABLED);
+static_assert(!__builtin_is_cpp_trivially_relocatable(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__builtin_is_cpp_trivially_relocatable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __builtin_is_replaceable(AddressDiscriminatedPtr));
+static_assert( __builtin_is_replaceable(AddressDiscriminatedInt64));
+static_assert( __builtin_is_replaceable(AddressDiscriminatedFields));
+static_assert( __builtin_is_replaceable(RelocatableAddressDiscriminatedFields));
+static_assert( __builtin_is_replaceable(AddressDiscriminatedFieldInBaseClass));
+static_assert(!__builtin_is_replaceable(NonAddressDiscriminatedVTablePtr));
+static_assert(!__builtin_is_replaceable(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__builtin_is_replaceable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__builtin_is_replaceable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__builtin_is_replaceable(AddressDiscriminatedVTablePtr));
+static_assert(!__builtin_is_replaceable(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__builtin_is_replaceable(NoAddressDiscriminatedBaseClasses));
+static_assert(!__builtin_is_replaceable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__builtin_is_replaceable(AddressDiscriminatedPrimaryBase));
+static_assert(!__builtin_is_replaceable(AddressDiscriminatedSecondaryBase));
+static_assert(!__builtin_is_replaceable(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__builtin_is_replaceable(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__builtin_is_replaceable(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__builtin_is_replaceable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(NonAddressDiscriminatedVTablePtr));
+static_assert( __is_bitwise_cloneable(NonAddressDiscriminatedVTablePtr2));
+static_assert( __is_bitwise_cloneable(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert( __is_bitwise_cloneable(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedVTablePtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(RelocatableAddressDiscriminatedVTablePtr) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(NoAddressDiscriminatedBaseClasses));
+static_assert( __is_bitwise_cloneable(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedPrimaryBase) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(AddressDiscriminatedSecondaryBase) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(RelocatableAddressDiscriminatedPrimaryBase) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(RelocatableAddressDiscriminatedSecondaryBase) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(EmbdeddedAddressDiscriminatedPolymorphicClass) == !ADDR_DISC_ENABLED);
+static_assert( __is_bitwise_cloneable(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass) == !ADDR_DISC_ENABLED);
+
+static_assert( __has_unique_object_representations(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( __has_unique_object_representations(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( __has_unique_object_representations(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_unique_object_representations(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( __has_unique_object_representations(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!__has_unique_object_representations(NonAddressDiscriminatedVTablePtr));
+static_assert(!__has_unique_object_representations(NonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_unique_object_representations(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!__has_unique_object_representations(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!__has_unique_object_representations(AddressDiscriminatedVTablePtr));
+static_assert(!__has_unique_object_representations(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!__has_unique_object_representations(NoAddressDiscriminatedBaseClasses));
+static_assert(!__has_unique_object_representations(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!__has_unique_object_representations(AddressDiscriminatedPrimaryBase));
+static_assert(!__has_unique_object_representations(AddressDiscriminatedSecondaryBase));
+static_assert(!__has_unique_object_representations(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!__has_unique_object_representations(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!__has_unique_object_representations(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!__has_unique_object_representations(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
+
+#define ASSIGNABLE_WRAPPER(Type) __is_trivially_assignable(Type&, Type)
+static_assert( ASSIGNABLE_WRAPPER(AddressDiscriminatedPtr) == !ADDR_DISC_ENABLED);
+static_assert( ASSIGNABLE_WRAPPER(AddressDiscriminatedInt64) == !ADDR_DISC_ENABLED);
+static_assert( ASSIGNABLE_WRAPPER(AddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( ASSIGNABLE_WRAPPER(RelocatableAddressDiscriminatedFields) == !ADDR_DISC_ENABLED);
+static_assert( ASSIGNABLE_WRAPPER(AddressDiscriminatedFieldInBaseClass) == !ADDR_DISC_ENABLED);
+static_assert(!ASSIGNABLE_WRAPPER(NonAddressDiscriminatedVTablePtr));
+static_assert(!ASSIGNABLE_WRAPPER(NonAddressDiscriminatedVTablePtr2));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableNonAddressDiscriminatedVTablePtr));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableNonAddressDiscriminatedVTablePtr2));
+static_assert(!ASSIGNABLE_WRAPPER(AddressDiscriminatedVTablePtr));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableAddressDiscriminatedVTablePtr));
+static_assert(!ASSIGNABLE_WRAPPER(NoAddressDiscriminatedBaseClasses));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableNoAddressDiscriminatedBaseClasses));
+static_assert(!ASSIGNABLE_WRAPPER(AddressDiscriminatedPrimaryBase));
+static_assert(!ASSIGNABLE_WRAPPER(AddressDiscriminatedSecondaryBase));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableAddressDiscriminatedPrimaryBase));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableAddressDiscriminatedSecondaryBase));
+static_assert(!ASSIGNABLE_WRAPPER(EmbdeddedAddressDiscriminatedPolymorphicClass));
+static_assert(!ASSIGNABLE_WRAPPER(RelocatableEmbdeddedAddressDiscriminatedPolymorphicClass));
diff --git a/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
index b38499a634fc..4a907b8c1cb5 100644
--- a/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
+++ b/clang/test/SemaCXX/trivially-relocatable-ptrauth.cpp
@@ -57,7 +57,7 @@ struct Foo : Polymorphic {
 };
 
 
-static_assert(__builtin_is_cpp_trivially_relocatable(Polymorphic));
+static_assert(!__builtin_is_cpp_trivially_relocatable(Polymorphic));
 
 struct [[clang::ptrauth_vtable_pointer(process_independent,no_address_discrimination,no_extra_discrimination)]] NonAddressDiscriminatedPolymorphic trivially_relocatable_if_eligible {
   virtual ~NonAddressDiscriminatedPolymorphic();
@@ -70,7 +70,7 @@ struct PolymorphicMembers {
     Polymorphic field;
 };
 
-static_assert(__builtin_is_cpp_trivially_relocatable(PolymorphicMembers));
+static_assert(!__builtin_is_cpp_trivially_relocatable(PolymorphicMembers));
 
 struct UnionOfPolymorphic {
   union trivially_relocatable_if_eligible {
diff --git a/clang/test/SemaCXX/warn-thread-safety-negative.cpp b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
index 9eabd67e4fc7..0caf6d6139e5 100644
--- a/clang/test/SemaCXX/warn-thread-safety-negative.cpp
+++ b/clang/test/SemaCXX/warn-thread-safety-negative.cpp
@@ -21,6 +21,15 @@ class LOCKABLE Mutex {
   void AssertReaderHeld() ASSERT_SHARED_LOCK();
 };
 
+class LOCKABLE REENTRANT_CAPABILITY ReentrantMutex {
+public:
+  void Lock() EXCLUSIVE_LOCK_FUNCTION();
+  void Unlock() UNLOCK_FUNCTION();
+
+  // for negative capabilities
+  const ReentrantMutex& operator!() const { return *this; }
+};
+
 class SCOPED_LOCKABLE MutexLock {
 public:
   MutexLock(Mutex *mu) EXCLUSIVE_LOCK_FUNCTION(mu);
@@ -89,6 +98,29 @@ class Foo {
   }
 };
 
+class Reentrant {
+  ReentrantMutex mu;
+
+public:
+  void acquire() {
+    mu.Lock();   // no warning -- reentrant mutex
+    mu.Unlock();
+  }
+
+  void requireNegative() EXCLUSIVE_LOCKS_REQUIRED(!mu) { // warning?
+    mu.Lock();
+    mu.Unlock();
+  }
+
+  void callRequireNegative() {
+    requireNegative(); // expected-warning{{calling function 'requireNegative' requires negative capability '!mu'}}
+  }
+
+  void callHaveNegative() EXCLUSIVE_LOCKS_REQUIRED(!mu) {
+    requireNegative();
+  }
+};
+
 }  // end namespace SimpleTest
 
 Mutex globalMutex;
diff --git a/clang/test/SemaCXX/wreturn-always-throws.cpp b/clang/test/SemaCXX/wreturn-always-throws.cpp
index addcadd1183d..df7689f7063c 100644
--- a/clang/test/SemaCXX/wreturn-always-throws.cpp
+++ b/clang/test/SemaCXX/wreturn-always-throws.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type -verify %s
+// RUN: %clang_cc1 -fsyntax-only -fcxx-exceptions -fexceptions -Wreturn-type -Winvalid-noreturn -verify %s
 // expected-no-diagnostics
 
 namespace std {
@@ -44,3 +44,22 @@ void testTemplates() {
   throwErrorTemplate("ERROR");
   (void)ensureZeroTemplate(42);
 }
+
+// Ensure that explicit specialization of a member function does not inherit
+// the warning from the primary template.
+
+template<typename T>
+struct S {
+  void f();
+  void g();
+};
+
+template<typename T>
+void S<T>::f() { throw 0; } 
+template<>
+void S<int>::f() {}
+
+template<typename T> 
+void S<T>::g() {}  
+template<> 
+void S<int>::g() { throw 0; }
diff --git a/clang/test/SemaObjC/ptrauth-qualifier.m b/clang/test/SemaObjC/ptrauth-qualifier.m
index 74bbe6f09899..67a73bbe4577 100644
--- a/clang/test/SemaObjC/ptrauth-qualifier.m
+++ b/clang/test/SemaObjC/ptrauth-qualifier.m
@@ -1,13 +1,25 @@
-// RUN: %clang_cc1 -triple arm64-apple-ios -fsyntax-only -verify -fptrauth-intrinsics %s
+// RUN: %clang_cc1 -triple arm64-apple-ios -DIS_DARWIN -fsyntax-only -verify -fptrauth-intrinsics %s
 // RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify -fptrauth-intrinsics %s
 
-#if !__has_extension(ptrauth_qualifier)
+#if defined(IS_DARWIN) && !__has_extension(ptrauth_qualifier)
 // This error means that the __ptrauth qualifier availability test says  that it
 // is not available. This error is not expected in the output, if it is seen
 // there is a feature detection regression.
 #error __ptrauth qualifier not enabled
 #endif
 
+#if defined(IS_DARWIN) && !__has_feature(ptrauth_qualifier)
+// This error means that the __has_feature test for ptrauth_qualifier has
+// failed, despite it being expected on darwin.
+#error __ptrauth qualifier not enabled
+#elif !defined(IS_DARWIN) && (__has_feature(ptrauth_qualifier) || __has_extension(ptrauth_qualifier))
+#error ptrauth_qualifier labeled a feature on a non-darwin platform
+#endif
+
+#if !defined (__PTRAUTH__)
+#error __PTRAUTH__ test macro not defined when ptrauth is enabled
+#endif
+
 @interface Foo
 // expected-warning@-1 {{class 'Foo' defined without specifying a base class}}
 // expected-note@-2 {{add a super class to fix this problem}}
diff --git a/clang/test/SemaObjCXX/discarded-block-type-inference.mm b/clang/test/SemaObjCXX/discarded-block-type-inference.mm
new file mode 100644
index 000000000000..8e2587724a7f
--- /dev/null
+++ b/clang/test/SemaObjCXX/discarded-block-type-inference.mm
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fobjc-arc -fblocks %s
+
+void  block_receiver(int (^)() );
+
+int f1() {
+  if constexpr (0)
+    (block_receiver)(^{ return 2; });
+  return 1;
+}
+
+int f2() {
+  if constexpr (0)
+    return (^{ return 2; })();
+  return 1;
+}
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 62a4f95d79c7..6d6052791507 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1228,25 +1228,25 @@ template <KnownKind T> struct KnownType {
 
 }
 
-namespace GH115838 {
+namespace CWG2369_Regression_2 {
 
-template<typename T> concept has_x = requires(T t) {{ t.x };};
-
-class Publ { public:    int x = 0; };
-class Priv { private:   int x = 0; };
-class Prot { protected: int x = 0; };
-class Same { protected: int x = 0; };
-
-template<typename T> class D;
-template<typename T> requires ( has_x<T>) class D<T>: public T { public: static constexpr bool has = 1; };
-template<typename T> requires (!has_x<T>) class D<T>: public T { public: static constexpr bool has = 0; };
+template <typename T>
+concept HasFastPropertyForAttribute =
+    requires(T element, int name) { element.propertyForAttribute(name); };
+
+template <typename OwnerType>
+struct SVGPropertyOwnerRegistry {
+  static int fastAnimatedPropertyLookup() {
+    static_assert (HasFastPropertyForAttribute<OwnerType>);
+    return 1;
+  }
+};
 
-// "Same" is identical to "Prot" but queried before used.
-static_assert(!has_x<Same>,  "Protected should be invisible.");
-static_assert(!D<Same>::has, "Protected should be invisible.");
+class SVGCircleElement {
+  friend SVGPropertyOwnerRegistry<SVGCircleElement>;
+  void propertyForAttribute(int);
+};
 
-static_assert( D<Publ>::has, "Public should be visible.");
-static_assert(!D<Priv>::has, "Private should be invisible.");
-static_assert(!D<Prot>::has, "Protected should be invisible.");
+int i = SVGPropertyOwnerRegistry<SVGCircleElement>::fastAnimatedPropertyLookup();
 
 }
diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp
index 0953f647426f..f6bc6ee3673c 100644
--- a/clang/test/SemaTemplate/deduction-guide.cpp
+++ b/clang/test/SemaTemplate/deduction-guide.cpp
@@ -966,3 +966,19 @@ Expand<Type, Invocable<>> _{};
 // CHECK-NEXT:   | `-ParmVarDecl {{.+}} 'T...' pack
 
 }
+
+namespace GH134613 {
+template <typename R> struct Foo {
+  using value_type = R;
+
+  Foo() = default;
+  Foo(Foo<Foo<R>> &&rhs) {}
+};
+
+void main() {
+  auto r1 = Foo(Foo<Foo<int>>{});
+
+  static_assert(__is_same(decltype(r1)::value_type, int));
+}
+
+}
diff --git a/clang/test/bindings/python/bindings.sh b/clang/test/bindings/python/bindings.sh
deleted file mode 100755
index 3f7a51ef9ca4..000000000000
--- a/clang/test/bindings/python/bindings.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/sh
-
-# UNSUPPORTED: !libclang-loadable
-
-# Tests fail on Windows, and need someone knowledgeable to fix.
-# It's not clear whether it's a test or a valid binding problem.
-# XFAIL: target={{.*windows.*}}
-
-# The Python FFI interface is broken on AIX: https://bugs.python.org/issue38628.
-# XFAIL: target={{.*-aix.*}}
-
-# Hexagon has known test failures that need to be addressed.
-# https://reviews.llvm.org/D52840#1265716
-# XFAIL: target={{hexagon-.*}}
-# python SEGVs on Linux/sparc64 when loading libclang.so.  Seems to be an FFI
-# issue, too.
-# XFAIL: target={{sparc.*-.*-linux.*}}
-
-# Tests will fail if cross-compiling for a different target, as tests will try
-# to use the host Python3_EXECUTABLE and make FFI calls to functions in target
-# libraries.
-#
-# FIXME: Consider a solution that allows better control over these tests in
-# a crosscompiling scenario. e.g. registering them with lit to allow them to
-# be explicitly skipped via appropriate LIT_ARGS, or adding a mechanism to
-# allow specifying a python interpreter compiled for the target that could
-# be executed using qemu-user.
-# REQUIRES: native
-
-# SystemZ has broken Python/FFI interface
-# according to https://reviews.llvm.org/D52840#1265716
-# This leads to failures only when Clang is built with GCC apparently, see:
-# https://github.com/llvm/llvm-project/pull/146844#issuecomment-3048291798
-# REQUIRES: !target={{s390x-.*}}
-
-# RUN: env PYTHONPATH=%S/../../../bindings/python \
-# RUN:   CLANG_LIBRARY_PATH=%libdir \
-# RUN:   %python -m unittest discover -s %S/tests
diff --git a/clang/test/bindings/python/lit.local.cfg b/clang/test/bindings/python/lit.local.cfg
deleted file mode 100644
index cc3bdf8ba97d..000000000000
--- a/clang/test/bindings/python/lit.local.cfg
+++ /dev/null
@@ -1,41 +0,0 @@
-def is_libclang_loadable():
-    # Do not try to run if libclang was built with sanitizers because
-    # the sanitizer library will likely be loaded too late to perform
-    # interception and will then fail.
-    # We could use LD_PRELOAD/DYLD_INSERT_LIBRARIES but this isn't
-    # portable so its easier just to not run the tests when building
-    # with ASan.
-    if config.llvm_use_sanitizer != "":
-        return False
-    try:
-        sys.path.append(os.path.join(config.clang_src_dir, "bindings/python"))
-        from clang.cindex import Config
-        conf = Config()
-        Config.set_library_path(config.clang_lib_dir)
-        conf.lib
-        return True
-    except Exception as e:
-        # Expected failure modes are considered benign when nothing can be
-        # done about them.
-        #
-        # Cannot load a 32-bit libclang.so into a 64-bit python.
-        if "wrong ELF class: ELFCLASS32" in str(e):
-            return False
-        # If libclang.so is missing, it must have been disabled intentionally,
-        # e.g. by building with LLVM_ENABLE_PIC=OFF.
-        elif "No such file or directory" in str(e):
-            return False
-        # Unexpected failure modes need to be investigated to either fix an
-        # underlying bug or accept the failure, so return True.  This causes
-        # tests to run and FAIL, drawing developer attention.
-        else:
-            print("warning: unhandled failure in is_libclang_loadable: "
-                  + str(e), file=sys.stderr)
-            return True
-
-if is_libclang_loadable():
-    config.available_features.add("libclang-loadable")
-
-config.substitutions.append(('%libdir', config.clang_lib_dir))
-
-config.suffixes = ['.sh']
diff --git a/clang/tools/clang-format/git-clang-format b/clang/tools/clang-format/git-clang-format
index e709803d9a3f..fe2dd283d403 100755
--- a/clang/tools/clang-format/git-clang-format
+++ b/clang/tools/clang-format/git-clang-format
@@ -419,7 +419,7 @@ def compute_diff(commits, files, staged, diff_common_commit):
     if len(commits) == 2:
         git_tool = "diff-tree"
         if diff_common_commit:
-            commits = [f"{commits[0]}...{commits[1]}"]
+            extra_args += ["--merge-base"]
     elif staged:
         extra_args += ["--cached"]
 
diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt
index b6662b66206b..2b1e266f0739 100644
--- a/clang/tools/libclang/CMakeLists.txt
+++ b/clang/tools/libclang/CMakeLists.txt
@@ -42,6 +42,7 @@ set(SOURCES
   Indexing.cpp
   FatalErrorHandler.cpp
   Rewrite.cpp
+  Obsolete.cpp
 
   ADDITIONAL_HEADERS
   CIndexDiagnostic.h
diff --git a/clang/tools/libclang/Obsolete.cpp b/clang/tools/libclang/Obsolete.cpp
new file mode 100644
index 000000000000..3596f76e1be6
--- /dev/null
+++ b/clang/tools/libclang/Obsolete.cpp
@@ -0,0 +1,48 @@
+//===- Obsolete.cpp - Obsolete libclang functions and types -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===--------------------------------------------------------------------===//
+//
+// This file contains libclang symbols whose underlying functionality has been
+// removed from Clang, but which need to be kept around so as to retain ABI
+// compatibility.
+//
+//===--------------------------------------------------------------------===//
+
+#include "clang-c/CXString.h"
+#include "clang-c/Index.h"
+#include "clang-c/Platform.h"
+#include "llvm/Support/raw_ostream.h"
+
+extern "C" {
+
+// The functions below used to be part of the C API for ARCMigrate, which has
+// since been removed from Clang; they already used to print an error if Clang
+// was compiled without arcmt support, so we continue doing so.
+CXRemapping clang_getRemappings(const char *) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang";
+  return nullptr;
+}
+
+CXRemapping clang_getRemappingsFromFileList(const char **, unsigned) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang";
+  return nullptr;
+}
+
+unsigned clang_remap_getNumFiles(CXRemapping) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang";
+  return 0;
+}
+
+void clang_remap_getFilenames(CXRemapping, unsigned, CXString *, CXString *) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang";
+}
+
+void clang_remap_dispose(CXRemapping) {
+  llvm::errs() << "error: ARCMigrate has been removed from Clang";
+}
+
+} // extern "C"
diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map
index d140a71e771a..3d9d2e268a61 100644
--- a/clang/tools/libclang/libclang.map
+++ b/clang/tools/libclang/libclang.map
@@ -327,6 +327,8 @@ LLVM_13 {
     clang_getRange;
     clang_getRangeEnd;
     clang_getRangeStart;
+    clang_getRemappings;
+    clang_getRemappingsFromFileList;
     clang_getResultType;
     clang_getSkippedRanges;
     clang_getSpecializedCursorTemplate;
@@ -387,6 +389,9 @@ LLVM_13 {
     clang_parseTranslationUnit;
     clang_parseTranslationUnit2;
     clang_parseTranslationUnit2FullArgv;
+    clang_remap_dispose;
+    clang_remap_getFilenames;
+    clang_remap_getNumFiles;
     clang_reparseTranslationUnit;
     clang_saveTranslationUnit;
     clang_sortCodeCompletionResults;
@@ -435,12 +440,12 @@ LLVM_20 {
     clang_getTypePrettyPrinted;
     clang_isBeforeInTranslationUnit;
     clang_visitCXXBaseClasses;
-    clang_visitCXXMethods;
 };
 
 LLVM_21 {
   global:
     clang_getFullyQualifiedName;
+    clang_visitCXXMethods;
     clang_Cursor_getGCCAssemblyTemplate;
     clang_Cursor_isGCCAssemblyHasGoto;
     clang_Cursor_getGCCAssemblyNumOutputs;
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index d17109aebc0f..2b17c36f6aa8 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -249,6 +249,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions,
                           AfterFunctionDefinitionName);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterIfMacros);
+  CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterNot);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterOverloadedOperator);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, AfterPlacementOperator);
   CHECK_PARSE_NESTED_BOOL(SpaceBeforeParensOptions, BeforeNonEmptyParentheses);
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 0bc1c6d45656..95682f2d8cfd 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -4050,6 +4050,10 @@ TEST_F(FormatTest, FormatsBitfields) {
                "  uchar : 8;\n"
                "  uchar other;\n"
                "};");
+  verifyFormat("struct foo {\n"
+               "  uint8_t i_am_a_bit_field_this_long\n"
+               "      : struct_with_constexpr::i_am_a_constexpr_lengthhhhh;\n"
+               "};");
   FormatStyle Style = getLLVMStyle();
   Style.BitFieldColonSpacing = FormatStyle::BFCS_None;
   verifyFormat("struct Bitfields {\n"
@@ -8571,10 +8575,10 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
                "operator<<(const SomeLooooooooooooooooooooooooogType &other);");
   verifyGoogleFormat(
       "SomeLoooooooooooooooooooooooooooooogType operator>>(\n"
-      "    const SomeLooooooooogType &a, const SomeLooooooooogType &b);");
+      "    const SomeLooooooooogType& a, const SomeLooooooooogType& b);");
   verifyGoogleFormat(
       "SomeLoooooooooooooooooooooooooooooogType operator<<(\n"
-      "    const SomeLooooooooogType &a, const SomeLooooooooogType &b);");
+      "    const SomeLooooooooogType& a, const SomeLooooooooogType& b);");
 
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    int aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa = 1);");
@@ -8583,7 +8587,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyGoogleFormat(
       "typename aaaaaaaaaa<aaaaaa>::aaaaaaaaaaa\n"
       "aaaaaaaaaa<aaaaaa>::aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
-      "    bool *aaaaaaaaaaaaaaaaaa, bool *aa) {}");
+      "    bool* aaaaaaaaaaaaaaaaaa, bool* aa) {}");
   verifyGoogleFormat("template <typename T>\n"
                      "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\n"
                      "aaaaaaaaaaaaaaaaaaaaaaa<T>::aaaaaaaaaaaaa(\n"
@@ -8592,7 +8596,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyFormat("extern \"C\" //\n"
                "    void f();");
 
-  FormatStyle Style = getLLVMStyle();
+  auto Style = getLLVMStyle();
   Style.PointerAlignment = FormatStyle::PAS_Left;
   verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n"
                "    aaaaaaaaaaaaaaaaaaaaaaaaa* const aaaaaaaaaaaa) {}",
@@ -8600,6 +8604,14 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) {
   verifyFormat("void aaaaaaa(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa*\n"
                "                 aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {}",
                Style);
+
+  Style = getLLVMStyleWithColumns(45);
+  Style.PenaltyReturnTypeOnItsOwnLine = 400;
+  verifyFormat("template <bool abool, // a comment\n"
+               "          bool anotherbool>\n"
+               "static inline std::pair<size_t, MyCustomType>\n"
+               "myfunc(const char *buf, const char *&err);",
+               Style);
 }
 
 TEST_F(FormatTest, DontBreakBeforeQualifiedOperator) {
@@ -12891,27 +12903,31 @@ TEST_F(FormatTest, UnderstandsEllipsis) {
 }
 
 TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
+  auto Style = getGoogleStyle();
+  EXPECT_FALSE(Style.DerivePointerAlignment);
+  Style.DerivePointerAlignment = true;
+
   verifyFormat("int *a;\n"
                "int *a;\n"
                "int *a;",
                "int *a;\n"
                "int* a;\n"
                "int *a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("int* a;\n"
                "int* a;\n"
                "int* a;",
                "int* a;\n"
                "int* a;\n"
                "int *a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("int *a;\n"
                "int *a;\n"
                "int *a;",
                "int *a;\n"
                "int * a;\n"
                "int *  a;",
-               getGoogleStyle());
+               Style);
   verifyFormat("auto x = [] {\n"
                "  int *a;\n"
                "  int *a;\n"
@@ -12920,7 +12936,7 @@ TEST_F(FormatTest, AdaptivelyFormatsPointersAndReferences) {
                "auto x=[]{int *a;\n"
                "int * a;\n"
                "int *  a;};",
-               getGoogleStyle());
+               Style);
 }
 
 TEST_F(FormatTest, UnderstandsRvalueReferences) {
@@ -13056,7 +13072,7 @@ TEST_F(FormatTest, FormatsCasts) {
   verifyFormat("virtual void foo(char &) const;");
   verifyFormat("virtual void foo(int *a, char *) const;");
   verifyFormat("int a = sizeof(int *) + b;");
-  verifyGoogleFormat("int a = alignof(int *) + b;");
+  verifyGoogleFormat("int a = alignof(int*) + b;");
   verifyFormat("bool b = f(g<int>) && c;");
   verifyFormat("typedef void (*f)(int i) func;");
   verifyFormat("void operator++(int) noexcept;");
@@ -17652,6 +17668,12 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeParens) {
   verifyFormat("int x = int (y);", SomeSpace2);
   verifyFormat("auto lambda = []() { return 0; };", SomeSpace2);
 
+  auto Style = getLLVMStyle();
+  Style.SpaceBeforeParens = FormatStyle::SBPO_Custom;
+  EXPECT_FALSE(Style.SpaceBeforeParensOptions.AfterNot);
+  Style.SpaceBeforeParensOptions.AfterNot = true;
+  verifyFormat("return not (a || b);", Style);
+
   FormatStyle SpaceAfterOverloadedOperator = getLLVMStyle();
   SpaceAfterOverloadedOperator.SpaceBeforeParens = FormatStyle::SBPO_Custom;
   SpaceAfterOverloadedOperator.SpaceBeforeParensOptions
@@ -25419,7 +25441,7 @@ TEST_F(FormatTest, AtomicQualifier) {
   verifyFormat("struct foo {\n"
                "  int a1;\n"
                "  _Atomic(a) a2;\n"
-               "  _Atomic(_Atomic(int) *const) a3;\n"
+               "  _Atomic(_Atomic(int)* const) a3;\n"
                "};",
                Google);
   verifyFormat("_Atomic(uint64_t) a;");
diff --git a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
index b1e42e924e05..67b9cc903790 100644
--- a/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
+++ b/clang/unittests/Format/IntegerLiteralSeparatorTest.cpp
@@ -83,6 +83,9 @@ TEST_F(IntegerLiteralSeparatorTest, SingleQuoteAsSeparator) {
                "d = 5678_km;\n"
                "h = 0xDEF_u16;",
                Style);
+
+  Style.Standard = FormatStyle::LS_Cpp11;
+  verifyFormat("ld = 1234L;", Style);
 }
 
 TEST_F(IntegerLiteralSeparatorTest, UnderscoreAsSeparator) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index e281a4945a86..af94841c820a 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -390,6 +390,10 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) {
   EXPECT_TOKEN(Tokens[20], tok::l_brace, TT_CompoundRequirementLBrace);
   EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator);
 
+  Tokens = annotate("bool foo = requires { static_cast<Foo &&>(1); };");
+  ASSERT_EQ(Tokens.size(), 17u) << Tokens;
+  EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_PointerOrReference);
+
   Tokens = annotate("return s.operator int *();");
   ASSERT_EQ(Tokens.size(), 10u) << Tokens;
   // Not TT_FunctionDeclarationName.
@@ -614,6 +618,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace);
   EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace);
 
+  Tokens = annotate("class Outer {\n"
+                    "  struct Inner final : Base {};\n"
+                    "};");
+  ASSERT_EQ(Tokens.size(), 14u) << Tokens;
+  EXPECT_TOKEN(Tokens[5], tok::identifier, TT_Unknown); // Not TT_StartOfName
+  EXPECT_TOKEN(Tokens[6], tok::colon, TT_InheritanceColon);
+
   constexpr StringRef Code{"struct EXPORT StructName {};"};
 
   Tokens = annotate(Code);
diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt
index 96ca6dda9cd8..fa5e58f5a893 100644
--- a/clang/unittests/Lex/CMakeLists.txt
+++ b/clang/unittests/Lex/CMakeLists.txt
@@ -5,6 +5,7 @@ add_clang_unittest(LexTests
   LexerTest.cpp
   LexHLSLRootSignatureTest.cpp
   ModuleDeclStateTest.cpp
+  NoTrivialPPDirectiveTracerTest.cpp
   PPCallbacksTest.cpp
   PPConditionalDirectiveRecordTest.cpp
   PPDependencyDirectivesTest.cpp
diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp
index 86df872f6b7d..bb6404c43448 100644
--- a/clang/unittests/Lex/LexerTest.cpp
+++ b/clang/unittests/Lex/LexerTest.cpp
@@ -796,7 +796,7 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
                                     PP->getSourceManager(), PP->getLangOpts(),
                                     /*IgnoreWhiteSpace=*/false));
-    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation());
     EXPECT_TRUE(Tok.is(tok::hash));
   }
 
@@ -812,7 +812,7 @@ TEST_F(LexerTest, CheckFirstPPToken) {
     EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok,
                                     PP->getSourceManager(), PP->getLangOpts(),
                                     /*IgnoreWhiteSpace=*/false));
-    EXPECT_TRUE(Tok.isFirstPPToken());
+    EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation());
     EXPECT_TRUE(Tok.is(tok::raw_identifier));
     EXPECT_TRUE(Tok.getRawIdentifier() == "FOO");
   }
diff --git a/clang/unittests/Lex/ModuleDeclStateTest.cpp b/clang/unittests/Lex/ModuleDeclStateTest.cpp
index 6ecba4de3187..0c03cfd6d0f8 100644
--- a/clang/unittests/Lex/ModuleDeclStateTest.cpp
+++ b/clang/unittests/Lex/ModuleDeclStateTest.cpp
@@ -61,14 +61,15 @@ class ModuleDeclStateTest : public ::testing::Test {
     Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts);
   }
 
-  std::unique_ptr<Preprocessor>
-  getPreprocessor(const char *source, Language Lang) {
+  std::unique_ptr<Preprocessor> getPreprocessor(const char *source,
+                                                Language Lang) {
     std::unique_ptr<llvm::MemoryBuffer> Buf =
         llvm::MemoryBuffer::getMemBuffer(source);
     SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
 
     std::vector<std::string> Includes;
-    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes, LangStandard::lang_cxx20);
+    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes,
+                                 LangStandard::lang_cxx20);
     LangOpts.CPlusPlusModules = true;
     if (Lang != Language::CXX) {
       LangOpts.Modules = true;
@@ -113,12 +114,11 @@ export module foo;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -132,12 +132,11 @@ module foo;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_TRUE(PP->isInImplementationUnit());
@@ -151,12 +150,11 @@ module foo:part;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -170,12 +168,11 @@ export module foo:part;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -189,12 +186,11 @@ export module foo.dot:part.dot;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -208,12 +204,11 @@ TEST_F(ModuleDeclStateTest, NotModule) {
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -234,12 +229,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -261,12 +255,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2);
   EXPECT_TRUE(PP->isInNamedModule());
   EXPECT_TRUE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -287,12 +280,11 @@ import :another;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
 
   std::initializer_list<bool> ImportKinds = {true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -305,12 +297,11 @@ TEST_F(ModuleDeclStateTest, ImportAClangNamedModule) {
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::ObjCXX);
 
   std::initializer_list<bool> ImportKinds = {false};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
@@ -327,12 +318,11 @@ import M2;
   std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::ObjCXX);
 
   std::initializer_list<bool> ImportKinds = {false, true, false, true};
-  preprocess(*PP,
-             std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds));
-
-  auto *Callback =
-      static_cast<CheckNamedModuleImportingCB *>(PP->getPPCallbacks());
-  EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)4);
+  auto Callback =
+      std::make_unique<CheckNamedModuleImportingCB>(*PP, ImportKinds);
+  CheckNamedModuleImportingCB *CallbackPtr = Callback.get();
+  preprocess(*PP, std::move(Callback));
+  EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)4);
   EXPECT_FALSE(PP->isInNamedModule());
   EXPECT_FALSE(PP->isInNamedInterfaceUnit());
   EXPECT_FALSE(PP->isInImplementationUnit());
diff --git a/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp
new file mode 100644
index 000000000000..8b546fe2a40e
--- /dev/null
+++ b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp
@@ -0,0 +1,183 @@
+//===- unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Basic/Diagnostic.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/FileManager.h"
+#include "clang/Basic/LangOptions.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/TargetOptions.h"
+#include "clang/Lex/HeaderSearch.h"
+#include "clang/Lex/HeaderSearchOptions.h"
+#include "clang/Lex/ModuleLoader.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "gtest/gtest.h"
+#include <cstddef>
+#include <initializer_list>
+
+using namespace clang;
+
+namespace {
+class NoTrivialPPDirectiveTracerTest : public ::testing::Test {
+protected:
+  NoTrivialPPDirectiveTracerTest()
+      : VFS(llvm::makeIntrusiveRefCnt<llvm::vfs::InMemoryFileSystem>()),
+        FileMgr(FileMgrOpts, VFS), DiagID(new DiagnosticIDs()),
+        Diags(DiagID, DiagOpts, new IgnoringDiagConsumer()),
+        SourceMgr(Diags, FileMgr), TargetOpts(new TargetOptions) {
+    TargetOpts->Triple = "x86_64-unknown-linux-gnu";
+    Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts);
+  }
+
+  void addFile(const char *source, StringRef Filename) {
+    VFS->addFile(Filename, 0, llvm::MemoryBuffer::getMemBuffer(source),
+                 /*User=*/std::nullopt,
+                 /*Group=*/std::nullopt,
+                 llvm::sys::fs::file_type::regular_file);
+  }
+
+  std::unique_ptr<Preprocessor> getPreprocessor(const char *source,
+                                                Language Lang) {
+    std::unique_ptr<llvm::MemoryBuffer> Buf =
+        llvm::MemoryBuffer::getMemBuffer(source);
+    SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf)));
+
+    std::vector<std::string> Includes;
+    LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes,
+                                 LangStandard::lang_cxx20);
+    LangOpts.CPlusPlusModules = true;
+    if (Lang != Language::CXX) {
+      LangOpts.Modules = true;
+      LangOpts.ImplicitModules = true;
+    }
+
+    HeaderInfo.emplace(HSOpts, SourceMgr, Diags, LangOpts, Target.get());
+
+    auto DE = FileMgr.getOptionalDirectoryRef(".");
+    assert(DE);
+    auto DL = DirectoryLookup(*DE, SrcMgr::C_User, /*isFramework=*/false);
+    HeaderInfo->AddSearchPath(DL, /*isAngled=*/false);
+
+    return std::make_unique<Preprocessor>(PPOpts, Diags, LangOpts, SourceMgr,
+                                          *HeaderInfo, ModLoader,
+                                          /*IILookup=*/nullptr,
+                                          /*OwnsHeaderSearch=*/false);
+  }
+
+  IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> VFS;
+  FileSystemOptions FileMgrOpts;
+  FileManager FileMgr;
+  IntrusiveRefCntPtr<DiagnosticIDs> DiagID;
+  DiagnosticOptions DiagOpts;
+  DiagnosticsEngine Diags;
+  SourceManager SourceMgr;
+  std::shared_ptr<TargetOptions> TargetOpts;
+  IntrusiveRefCntPtr<TargetInfo> Target;
+  LangOptions LangOpts;
+  TrivialModuleLoader ModLoader;
+  HeaderSearchOptions HSOpts;
+  std::optional<HeaderSearch> HeaderInfo;
+  PreprocessorOptions PPOpts;
+};
+
+TEST_F(NoTrivialPPDirectiveTracerTest, TrivialDirective) {
+  const char *source = R"(
+    #line 7
+    # 1 __FILE__ 1 3
+    #ident "$Header:$"
+    #pragma comment(lib, "msvcrt.lib")
+    #pragma mark LLVM's world
+    #pragma detect_mismatch("test", "1")
+    #pragma clang __debug dump Test
+    #pragma message "test"
+    #pragma GCC warning "Foo"
+    #pragma GCC error "Foo"
+    #pragma gcc diagnostic push
+    #pragma gcc diagnostic pop
+    #pragma GCC diagnostic ignored "-Wframe-larger-than"
+    #pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable
+    #pragma warning(push)
+    #pragma warning(pop)
+    #pragma execution_character_set(push, "UTF-8")
+    #pragma execution_character_set(pop)
+    #pragma clang assume_nonnull begin
+    #pragma clang assume_nonnull end
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  PP->Initialize(*Target);
+  PP->EnterMainSourceFile();
+  Token Tok;
+  PP->Lex(Tok);
+  EXPECT_FALSE(PP->hasSeenNoTrivialPPDirective());
+}
+
+TEST_F(NoTrivialPPDirectiveTracerTest, IncludeDirective) {
+  const char *source = R"(
+    #include "header.h"
+    int foo;
+  )";
+  const char *header = R"(
+    #ifndef HEADER_H
+    #define HEADER_H
+    #endif // HEADER_H
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  addFile(header, "header.h");
+  PP->Initialize(*Target);
+  PP->EnterMainSourceFile();
+  Token Tok;
+  PP->Lex(Tok);
+  EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective());
+}
+
+TEST_F(NoTrivialPPDirectiveTracerTest, DefineDirective) {
+  const char *source = R"(
+    #define FOO
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  PP->Initialize(*Target);
+  PP->EnterMainSourceFile();
+  Token Tok;
+  PP->Lex(Tok);
+  EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective());
+}
+
+TEST_F(NoTrivialPPDirectiveTracerTest, UnDefineDirective) {
+  const char *source = R"(
+    #undef FOO
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  PP->Initialize(*Target);
+  PP->setPredefines("#define FOO");
+  PP->EnterMainSourceFile();
+  Token Tok;
+  PP->Lex(Tok);
+  EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective());
+}
+
+TEST_F(NoTrivialPPDirectiveTracerTest, IfDefinedDirective) {
+  const char *source = R"(
+    #if defined(FOO)
+    #endif
+    int foo;
+  )";
+  std::unique_ptr<Preprocessor> PP = getPreprocessor(source, Language::CXX);
+  PP->Initialize(*Target);
+  PP->setPredefines("#define FOO");
+  PP->EnterMainSourceFile();
+  Token Tok;
+  PP->Lex(Tok);
+  EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective());
+}
+
+} // namespace
diff --git a/cmake/Modules/CMakePolicy.cmake b/cmake/Modules/CMakePolicy.cmake
index 421f5a4dda6c..cf986331707b 100644
--- a/cmake/Modules/CMakePolicy.cmake
+++ b/cmake/Modules/CMakePolicy.cmake
@@ -41,3 +41,9 @@ if(POLICY CMP0156)
     cmake_policy(SET CMP0179 NEW)
   endif()
 endif()
+
+# CMP0182: Create shared library archives by default on AIX.
+# New in CMake 4.0: https://cmake.org/cmake/help/latest/policy/CMP0182.html
+if(POLICY CMP0182)
+  cmake_policy(SET CMP0182 NEW)
+endif()
diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake
index f14aae172f07..ef28fe845ddb 100644
--- a/cmake/Modules/LLVMVersion.cmake
+++ b/cmake/Modules/LLVMVersion.cmake
@@ -4,12 +4,12 @@ if(NOT DEFINED LLVM_VERSION_MAJOR)
   set(LLVM_VERSION_MAJOR 21)
 endif()
 if(NOT DEFINED LLVM_VERSION_MINOR)
-  set(LLVM_VERSION_MINOR 0)
+  set(LLVM_VERSION_MINOR 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_PATCH)
-  set(LLVM_VERSION_PATCH 0)
+  set(LLVM_VERSION_PATCH 1)
 endif()
 if(NOT DEFINED LLVM_VERSION_SUFFIX)
-  set(LLVM_VERSION_SUFFIX git)
+  set(LLVM_VERSION_SUFFIX)
 endif()
 
diff --git a/compiler-rt/lib/builtins/aarch64/lse.S b/compiler-rt/lib/builtins/aarch64/lse.S
index d7c1db7243ef..a444d82892c3 100644
--- a/compiler-rt/lib/builtins/aarch64/lse.S
+++ b/compiler-rt/lib/builtins/aarch64/lse.S
@@ -264,7 +264,7 @@ END_COMPILERRT_OUTLINE_FUNCTION(NAME(LDNM))
 
 NO_EXEC_STACK_DIRECTIVE
 
-// GNU property note for BTI and PAC
-GNU_PROPERTY_BTI_PAC
+// GNU property note for BTI, PAC, and GCS
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif // defined(__aarch64__) || defined(__arm64ec__)
diff --git a/compiler-rt/lib/builtins/aarch64/sme-abi.S b/compiler-rt/lib/builtins/aarch64/sme-abi.S
index 7c47336cfc57..d5510ac0cfa5 100644
--- a/compiler-rt/lib/builtins/aarch64/sme-abi.S
+++ b/compiler-rt/lib/builtins/aarch64/sme-abi.S
@@ -371,5 +371,5 @@ END_COMPILERRT_FUNCTION(__arm_sme_restore)
 
 NO_EXEC_STACK_DIRECTIVE
 
-// GNU property note for BTI and PAC
-GNU_PROPERTY_BTI_PAC
+// GNU property note for BTI, PAC, and GCS
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/builtins/assembly.h b/compiler-rt/lib/builtins/assembly.h
index 89372f18c84b..d7db7d818945 100644
--- a/compiler-rt/lib/builtins/assembly.h
+++ b/compiler-rt/lib/builtins/assembly.h
@@ -79,11 +79,12 @@
 #define FUNC_ALIGN
 #endif
 
-// BTI and PAC gnu property note
+// BTI, PAC, and GCS gnu property note
 #define NT_GNU_PROPERTY_TYPE_0 5
 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI 1
 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC 2
+#define GNU_PROPERTY_AARCH64_FEATURE_1_GCS 4
 
 #if defined(__ARM_FEATURE_BTI_DEFAULT)
 #define BTI_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_BTI
@@ -97,6 +98,12 @@
 #define PAC_FLAG 0
 #endif
 
+#if defined(__ARM_FEATURE_GCS_DEFAULT)
+#define GCS_FLAG GNU_PROPERTY_AARCH64_FEATURE_1_GCS
+#else
+#define GCS_FLAG 0
+#endif
+
 #define GNU_PROPERTY(type, value)                                              \
   .pushsection .note.gnu.property, "a" SEPARATOR                               \
   .p2align 3 SEPARATOR                                                         \
@@ -118,11 +125,12 @@
 #define BTI_J
 #endif
 
-#if (BTI_FLAG | PAC_FLAG) != 0
-#define GNU_PROPERTY_BTI_PAC                                                   \
-  GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND, BTI_FLAG | PAC_FLAG)
+#if (BTI_FLAG | PAC_FLAG | GCS_FLAG) != 0
+#define GNU_PROPERTY_BTI_PAC_GCS                                               \
+  GNU_PROPERTY(GNU_PROPERTY_AARCH64_FEATURE_1_AND,                             \
+               BTI_FLAG | PAC_FLAG | GCS_FLAG)
 #else
-#define GNU_PROPERTY_BTI_PAC
+#define GNU_PROPERTY_BTI_PAC_GCS
 #endif
 
 #if defined(__clang__) || defined(__GCC_HAVE_DWARF2_CFI_ASM)
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c
index be002dd71992..d7880529ebe7 100644
--- a/compiler-rt/lib/builtins/cpu_model/aarch64.c
+++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c
@@ -34,12 +34,12 @@ typedef struct __ifunc_arg_t {
 _Bool __aarch64_have_lse_atomics
     __attribute__((visibility("hidden"), nocommon)) = false;
 
-#if defined(__FreeBSD__)
+#if defined(__FreeBSD__) || defined(__OpenBSD__)
 // clang-format off: should not reorder sys/auxv.h alphabetically
 #include <sys/auxv.h>
 // clang-format on
 #include "aarch64/hwcap.inc"
-#include "aarch64/lse_atomics/freebsd.inc"
+#include "aarch64/lse_atomics/elf_aux_info.inc"
 #elif defined(__Fuchsia__)
 #include "aarch64/hwcap.inc"
 #include "aarch64/lse_atomics/fuchsia.inc"
@@ -68,9 +68,9 @@ struct {
 // clang-format off
 #if defined(__APPLE__)
 #include "aarch64/fmv/apple.inc"
-#elif defined(__FreeBSD__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__)
 #include "aarch64/fmv/mrs.inc"
-#include "aarch64/fmv/freebsd.inc"
+#include "aarch64/fmv/elf_aux_info.inc"
 #elif defined(__Fuchsia__)
 #include "aarch64/fmv/fuchsia.inc"
 #elif defined(__ANDROID__)
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/elf_aux_info.inc
similarity index 100%
rename from compiler-rt/lib/builtins/cpu_model/aarch64/fmv/freebsd.inc
rename to compiler-rt/lib/builtins/cpu_model/aarch64/fmv/elf_aux_info.inc
diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/elf_aux_info.inc
similarity index 100%
rename from compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/freebsd.inc
rename to compiler-rt/lib/builtins/cpu_model/aarch64/lse_atomics/elf_aux_info.inc
diff --git a/compiler-rt/lib/builtins/crtbegin.c b/compiler-rt/lib/builtins/crtbegin.c
index d5f7756308b0..447474bd0b69 100644
--- a/compiler-rt/lib/builtins/crtbegin.c
+++ b/compiler-rt/lib/builtins/crtbegin.c
@@ -54,22 +54,33 @@ static void __attribute__((used)) __do_init(void) {
 }
 
 #ifdef CRT_HAS_INITFINI_ARRAY
-#if __has_feature(ptrauth_init_fini)
+# if __has_feature(ptrauth_init_fini)
 // TODO: use __ptrauth-qualified pointers when they are supported on clang side
-#if __has_feature(ptrauth_init_fini_address_discrimination)
+#  if __has_feature(ptrauth_init_fini_address_discrimination)
 __attribute__((section(".init_array"), used)) static void *__init =
     ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
                           ptrauth_blend_discriminator(
                               &__init, __ptrauth_init_fini_discriminator));
-#else
+#  else
 __attribute__((section(".init_array"), used)) static void *__init =
     ptrauth_sign_constant(&__do_init, ptrauth_key_init_fini_pointer,
                           __ptrauth_init_fini_discriminator);
-#endif
-#else
+#  endif
+# elif __has_feature(ptrauth_calls)
+#  ifdef __aarch64__
+// If ptrauth_init_fini feature is not present, compiler emits raw unsigned
+// pointers in .init_array. Use inline assembly to avoid implicit signing of
+// __do_init function pointer with ptrauth_calls enabled.
+__asm__(".pushsection .init_array,\"aw\",@init_array\n\t"
+        ".xword __do_init\n\t"
+        ".popsection");
+#  else
+#   error "ptrauth_calls is only supported for AArch64"
+#  endif
+# else
 __attribute__((section(".init_array"),
                used)) static void (*__init)(void) = __do_init;
-#endif
+# endif
 #elif defined(__i386__) || defined(__x86_64__)
 __asm__(".pushsection .init,\"ax\",@progbits\n\t"
         "call __do_init\n\t"
@@ -125,22 +136,33 @@ static void __attribute__((used)) __do_fini(void) {
 }
 
 #ifdef CRT_HAS_INITFINI_ARRAY
-#if __has_feature(ptrauth_init_fini)
+# if __has_feature(ptrauth_init_fini)
 // TODO: use __ptrauth-qualified pointers when they are supported on clang side
-#if __has_feature(ptrauth_init_fini_address_discrimination)
+#  if __has_feature(ptrauth_init_fini_address_discrimination)
 __attribute__((section(".fini_array"), used)) static void *__fini =
     ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
                           ptrauth_blend_discriminator(
                               &__fini, __ptrauth_init_fini_discriminator));
-#else
+#  else
 __attribute__((section(".fini_array"), used)) static void *__fini =
     ptrauth_sign_constant(&__do_fini, ptrauth_key_init_fini_pointer,
                           __ptrauth_init_fini_discriminator);
-#endif
-#else
+#  endif
+# elif __has_feature(ptrauth_calls)
+#  ifdef __aarch64__
+// If ptrauth_init_fini feature is not present, compiler emits raw unsigned
+// pointers in .fini_array. Use inline assembly to avoid implicit signing of
+// __do_fini function pointer with ptrauth_calls enabled.
+__asm__(".pushsection .fini_array,\"aw\",@fini_array\n\t"
+        ".xword __do_fini\n\t"
+        ".popsection");
+#  else
+#   error "ptrauth_calls is only supported for AArch64"
+#  endif
+# else
 __attribute__((section(".fini_array"),
                used)) static void (*__fini)(void) = __do_fini;
-#endif
+# endif
 #elif defined(__i386__) || defined(__x86_64__)
 __asm__(".pushsection .fini,\"ax\",@progbits\n\t"
         "call __do_fini\n\t"
diff --git a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
index fd20825e3dac..825f41156509 100644
--- a/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
+++ b/compiler-rt/lib/hwasan/hwasan_interceptors_vfork.S
@@ -11,4 +11,4 @@
 
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
index 0c0abb6de861..b8d98b09ada2 100644
--- a/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_setjmp_aarch64.S
@@ -99,4 +99,4 @@ ASM_TRAMPOLINE_ALIAS(_setjmp, setjmp)
 // We do not need executable stack.
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
index fd060c51cd8e..be82475101c8 100644
--- a/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
+++ b/compiler-rt/lib/hwasan/hwasan_tag_mismatch_aarch64.S
@@ -157,4 +157,4 @@ mismatch:
 // We do not need executable stack.
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
index 08c2be47f535..673f284b6a04 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc
@@ -344,12 +344,16 @@ static void ioctl_table_fill() {
   _(SOUND_PCM_WRITE_CHANNELS, WRITE, sizeof(int));
   _(SOUND_PCM_WRITE_FILTER, WRITE, sizeof(int));
   _(TCFLSH, NONE, 0);
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   _(TCGETS, WRITE, struct_termios_sz);
+#    endif
   _(TCSBRK, NONE, 0);
   _(TCSBRKP, NONE, 0);
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   _(TCSETS, READ, struct_termios_sz);
   _(TCSETSF, READ, struct_termios_sz);
   _(TCSETSW, READ, struct_termios_sz);
+#    endif
   _(TCXONC, NONE, 0);
   _(TIOCGLCKTRMIOS, WRITE, struct_termios_sz);
   _(TIOCGSOFTCAR, WRITE, sizeof(int));
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
index cdfa6f1d7f53..5066953980af 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_vfork_aarch64.inc.S
@@ -43,6 +43,6 @@ ASM_SIZE(vfork)
 ASM_INTERCEPTOR_TRAMPOLINE(vfork)
 ASM_TRAMPOLINE_ALIAS(vfork, vfork)
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index f5cb85bc1bf3..530ff90c4cd1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -29,6 +29,7 @@
 #  include "sanitizer_solaris.h"
 
 #  if SANITIZER_HAIKU
+#    define _GNU_SOURCE
 #    define _DEFAULT_SOURCE
 #  endif
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
index 196c0a988478..13099fe84b0a 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform.h
@@ -482,4 +482,19 @@
 #  define SANITIZER_START_BACKGROUND_THREAD_IN_ASAN_INTERNAL 0
 #endif
 
+#if SANITIZER_LINUX
+#  if SANITIZER_GLIBC
+// Workaround for
+// glibc/commit/3d3572f59059e2b19b8541ea648a6172136ec42e
+// Linux: Keep termios ioctl constants strictly internal
+#    if __GLIBC_PREREQ(2, 41)
+#      define SANITIZER_TERMIOS_IOCTL_CONSTANTS 0
+#    else
+#      define SANITIZER_TERMIOS_IOCTL_CONSTANTS 1
+#    endif
+#  else
+#    define SANITIZER_TERMIOS_IOCTL_CONSTANTS 1
+#  endif
+#endif
+
 #endif  // SANITIZER_PLATFORM_H
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
index 4c1e00528923..c4fa1e3c1f6f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.cpp
@@ -71,14 +71,8 @@
 #include <semaphore.h>
 #include <signal.h>
 #include <stddef.h>
-#include <md5.h>
-#include <sha224.h>
-#include <sha256.h>
-#include <sha384.h>
-#include <sha512.h>
 #include <stdio.h>
 #include <stringlist.h>
-#include <term.h>
 #include <termios.h>
 #include <time.h>
 #include <ttyent.h>
@@ -370,22 +364,6 @@ const int si_SEGV_MAPERR = SEGV_MAPERR;
 const int si_SEGV_ACCERR = SEGV_ACCERR;
 const int unvis_valid = UNVIS_VALID;
 const int unvis_validpush = UNVIS_VALIDPUSH;
-
-const unsigned MD5_CTX_sz = sizeof(MD5_CTX);
-const unsigned MD5_return_length = MD5_DIGEST_STRING_LENGTH;
-
-#define SHA2_CONST(LEN)                                                      \
-  const unsigned SHA##LEN##_CTX_sz = sizeof(SHA##LEN##_CTX);                 \
-  const unsigned SHA##LEN##_return_length = SHA##LEN##_DIGEST_STRING_LENGTH; \
-  const unsigned SHA##LEN##_block_length = SHA##LEN##_BLOCK_LENGTH;          \
-  const unsigned SHA##LEN##_digest_length = SHA##LEN##_DIGEST_LENGTH
-
-SHA2_CONST(224);
-SHA2_CONST(256);
-SHA2_CONST(384);
-SHA2_CONST(512);
-
-#undef SHA2_CONST
 }  // namespace __sanitizer
 
 using namespace __sanitizer;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
index 382b67ce78eb..1cbb40e0b2ff 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_freebsd.h
@@ -710,22 +710,6 @@ extern unsigned IOCTL_KDSKBMODE;
 extern const int si_SEGV_MAPERR;
 extern const int si_SEGV_ACCERR;
 
-extern const unsigned MD5_CTX_sz;
-extern const unsigned MD5_return_length;
-
-#define SHA2_EXTERN(LEN)                          \
-  extern const unsigned SHA##LEN##_CTX_sz;        \
-  extern const unsigned SHA##LEN##_return_length; \
-  extern const unsigned SHA##LEN##_block_length;  \
-  extern const unsigned SHA##LEN##_digest_length
-
-SHA2_EXTERN(224);
-SHA2_EXTERN(256);
-SHA2_EXTERN(384);
-SHA2_EXTERN(512);
-
-#undef SHA2_EXTERN
-
 struct __sanitizer_cap_rights {
   u64 cr_rights[2];
 };
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
index aacd28c55cea..435f3b2861dc 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_netbsd.cpp
@@ -498,7 +498,6 @@ struct urio_command {
 #include <md5.h>
 #include <rmd160.h>
 #include <soundcard.h>
-#include <term.h>
 #include <termios.h>
 #include <time.h>
 #include <ttyent.h>
@@ -515,7 +514,7 @@ struct urio_command {
 #include <stringlist.h>
 
 #if defined(__x86_64__)
-#include <nvmm.h>
+#include <dev/nvmm/nvmm_ioctl.h>
 #endif
 // clang-format on
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
index 7a89bf1c7498..ea8cc306268c 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp
@@ -779,16 +779,16 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr);
   unsigned IOCTL_SOUND_PCM_WRITE_FILTER = SOUND_PCM_WRITE_FILTER;
 #endif // SOUND_VERSION
   unsigned IOCTL_TCFLSH = TCFLSH;
-  unsigned IOCTL_TCGETA = TCGETA;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   unsigned IOCTL_TCGETS = TCGETS;
+#    endif
   unsigned IOCTL_TCSBRK = TCSBRK;
   unsigned IOCTL_TCSBRKP = TCSBRKP;
-  unsigned IOCTL_TCSETA = TCSETA;
-  unsigned IOCTL_TCSETAF = TCSETAF;
-  unsigned IOCTL_TCSETAW = TCSETAW;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
   unsigned IOCTL_TCSETS = TCSETS;
   unsigned IOCTL_TCSETSF = TCSETSF;
   unsigned IOCTL_TCSETSW = TCSETSW;
+#    endif
   unsigned IOCTL_TCXONC = TCXONC;
   unsigned IOCTL_TIOCGLCKTRMIOS = TIOCGLCKTRMIOS;
   unsigned IOCTL_TIOCGSOFTCAR = TIOCGSOFTCAR;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
index a2b6c37d5450..f118d53f0df8 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h
@@ -102,6 +102,8 @@ const unsigned struct_kernel_stat_sz = SANITIZER_ANDROID
                                            ? FIRST_32_SECOND_64(104, 128)
 #      if defined(_ABIN32) && _MIPS_SIM == _ABIN32
                                            : FIRST_32_SECOND_64(176, 216);
+#      elif SANITIZER_MUSL
+                                           : FIRST_32_SECOND_64(160, 208);
 #      else
                                            : FIRST_32_SECOND_64(160, 216);
 #      endif
@@ -1312,16 +1314,14 @@ extern unsigned IOCTL_SNDCTL_COPR_SENDMSG;
 extern unsigned IOCTL_SNDCTL_COPR_WCODE;
 extern unsigned IOCTL_SNDCTL_COPR_WDATA;
 extern unsigned IOCTL_TCFLSH;
-extern unsigned IOCTL_TCGETA;
-extern unsigned IOCTL_TCGETS;
 extern unsigned IOCTL_TCSBRK;
 extern unsigned IOCTL_TCSBRKP;
-extern unsigned IOCTL_TCSETA;
-extern unsigned IOCTL_TCSETAF;
-extern unsigned IOCTL_TCSETAW;
+#    if SANITIZER_TERMIOS_IOCTL_CONSTANTS
+extern unsigned IOCTL_TCGETS;
 extern unsigned IOCTL_TCSETS;
 extern unsigned IOCTL_TCSETSF;
 extern unsigned IOCTL_TCSETSW;
+#    endif
 extern unsigned IOCTL_TCXONC;
 extern unsigned IOCTL_TIOCGLCKTRMIOS;
 extern unsigned IOCTL_TIOCGSOFTCAR;
diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
index 05065444a70c..612317b3c329 100644
--- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp
@@ -183,7 +183,8 @@ TEST_F(ScudoWrappersCDeathTest, Malloc) {
   // process doing free(P) is not a double free.
   EXPECT_DEATH(
       {
-        void *Ptr = malloc(Size);
+        // Note: volatile here prevents the calls from being optimized out.
+        void *volatile Ptr = malloc(Size);
         free(Ptr);
         free(Ptr);
       },
diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
index 7d920bee4a2d..f1d11a3e7f54 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
+++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_aarch64.S
@@ -222,6 +222,6 @@ ASM_SIZE(ASM_SYMBOL_INTERCEPTOR(__sigsetjmp))
 
 NO_EXEC_STACK_DIRECTIVE
 
-GNU_PROPERTY_BTI_PAC
+GNU_PROPERTY_BTI_PAC_GCS
 
 #endif
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py
index 877718c703ba..f5576ce0e013 100644
--- a/compiler-rt/test/lit.common.cfg.py
+++ b/compiler-rt/test/lit.common.cfg.py
@@ -503,7 +503,7 @@ def get_ios_commands_dir():
 # Define %arch to check for architecture-dependent output.
 config.substitutions.append(("%arch", (config.host_arch)))
 
-if config.host_os == "Windows":
+if os.name == "nt":
     # FIXME: This isn't quite right. Specifically, it will succeed if the program
     # does not crash but exits with a non-zero exit code. We ought to merge
     # KillTheDoctor and not --crash to make the latter more useful and remove the
diff --git a/cross-project-tests/CMakeLists.txt b/cross-project-tests/CMakeLists.txt
index b4b1f4762607..192db8704317 100644
--- a/cross-project-tests/CMakeLists.txt
+++ b/cross-project-tests/CMakeLists.txt
@@ -19,11 +19,12 @@ set(CROSS_PROJECT_TEST_DEPS
   FileCheck
   check-gdb-llvm-support
   count
-  llvm-dwarfdump
+  llvm-ar
   llvm-config
+  llvm-dwarfdump
   llvm-objdump
-  split-file
   not
+  split-file
   )
 
 if ("clang" IN_LIST LLVM_ENABLE_PROJECTS)
diff --git a/cross-project-tests/dtlto/ld-archive-thin.test b/cross-project-tests/dtlto/ld-archive-thin.test
new file mode 100644
index 000000000000..979da5423962
--- /dev/null
+++ b/cross-project-tests/dtlto/ld-archive-thin.test
@@ -0,0 +1,97 @@
+REQUIRES: ld.lld,llvm-ar
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-linux-gnu -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: llvm-ar rcs foo.a foo.o --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/bar.a bar.o --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs dog.a %t/lib/../dog.o --thin
+## The bitcode member of cat.a will not be used in the link.
+RUN: llvm-ar rcs cat.a cat.o --thin
+RUN: llvm-ar rcs start.a start.o --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+RUN: %clang --target=x86_64-linux-gnu -flto=thin -fuse-ld=lld %t/foo.a %t/lib/bar.a ../start.a %t/cat.a \
+RUN:   -Wl,--whole-archive ../dog.a \
+RUN:   -fthinlto-distributor=%python \
+RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+RUN:   -Wl,--save-temps -nostdlib -Werror
+
+## Check that the required output files have been created.
+RUN: ls | sort | FileCheck %s
+
+## No files are expected before.
+CHECK-NOT: {{.}}
+
+## JSON jobs description.
+CHECK: {{^}}a.[[PID:[a-zA-Z0-9_]+]].dist-file.json{{$}}
+
+## Native output object files and individual summary index files.
+CHECK: {{^}}bar.3.[[PID]].native.o{{$}}
+CHECK: {{^}}bar.3.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o{{$}}
+CHECK: {{^}}dog.1.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o{{$}}
+CHECK: {{^}}foo.2.[[PID]].native.o.thinlto.bc{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o{{$}}
+CHECK: {{^}}start.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## No files are expected after.
+CHECK-NOT: {{.}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurred.
+
+RUN: llvm-dis start.4.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis dog.1.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,DOG,START
+
+RUN: llvm-dis foo.2.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis bar.3.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+FOO-DAG:   foo.o
+BAR-DAG:   bar.o
+DOG-DAG:   dog.o
+START-DAG: start.o
+
+
+#--- foo.c
+extern int bar(int), _start(int);
+__attribute__((retain)) int foo(int x) { return x + bar(x) + _start(x); }
+
+#--- bar.c
+extern int foo(int), _start(int);
+__attribute__((retain)) int bar(int x) { return x + foo(x) + _start(x); }
+
+#--- dog.c
+extern int foo(int), bar(int), _start(int);
+__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + _start(x); }
+
+#--- cat.c
+__attribute__((retain)) void cat(int x) {}
+
+#--- start.c
+extern int foo(int), bar(int);
+__attribute__((retain)) int _start(int x) { return x + foo(x) + bar(x); }
diff --git a/cross-project-tests/dtlto/ld-dtlto.c b/cross-project-tests/dtlto/ld-dtlto.c
index 3ee962346bd4..7dffe2e015bc 100644
--- a/cross-project-tests/dtlto/ld-dtlto.c
+++ b/cross-project-tests/dtlto/ld-dtlto.c
@@ -5,13 +5,11 @@
 
 // RUN: rm -rf %t && mkdir %t && cd %t
 
-// RUN: %clang --target=x86_64-linux-gnu -c -flto=thin %s -o dtlto.o
-
-// RUN: ld.lld dtlto.o \
-// RUN:   --thinlto-distributor=%python \
-// RUN:   --thinlto-distributor-arg=%llvm_src_root/utils/dtlto/local.py \
-// RUN:   --thinlto-remote-compiler=%clang \
-// RUN:   --thinlto-remote-compiler-arg=--save-temps
+// RUN: %clang --target=x86_64-linux-gnu %s -flto=thin -fuse-ld=lld \
+// RUN:   -fthinlto-distributor=%python \
+// RUN:   -Xthinlto-distributor=%llvm_src_root/utils/dtlto/local.py \
+// RUN:   -Wl,--thinlto-remote-compiler-arg=--save-temps \
+// RUN:   -nostdlib -Werror
 
 /// Check that the required output files have been created.
 // RUN: ls | sort | FileCheck %s
@@ -22,18 +20,15 @@
 /// Linked ELF.
 // CHECK: {{^}}a.out{{$}}
 
-/// Produced by the bitcode compilation.
-// CHECK-NEXT: {{^}}dtlto.o{{$}}
-
 /// --save-temps output for the backend compilation.
-// CHECK-NEXT: {{^}}dtlto.s{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}}
-// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP:[a-zA-Z0-9_]+]].s{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.0.preopt.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.1.promote.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.2.internalize.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.3.import.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.4.opt.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.5.precodegen.bc{{$}}
+// CHECK-NEXT: {{^}}ld-dtlto-[[TMP]].s.resolution.txt{{$}}
 
 /// No files are expected after.
 // CHECK-NOT: {{.}}
diff --git a/cross-project-tests/dtlto/link-archive-thin.test b/cross-project-tests/dtlto/link-archive-thin.test
new file mode 100644
index 000000000000..fbd8fd67300c
--- /dev/null
+++ b/cross-project-tests/dtlto/link-archive-thin.test
@@ -0,0 +1,93 @@
+REQUIRES: lld-link
+
+## Test that a DTLTO link succeeds and outputs the expected set of files
+## correctly when thin archives are present.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+## Compile bitcode. -O2 is required for cross-module importing.
+RUN: %clang -O2 --target=x86_64-pc-windows-msvc -flto=thin -c \
+RUN:   foo.c bar.c dog.c cat.c start.c
+
+## Generate thin archives.
+RUN: lld-link /lib /llvmlibthin /out:foo.lib foo.o
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../bar.o".
+RUN: mkdir lib
+RUN: lld-link /lib /llvmlibthin /out:lib/bar.lib bar.o
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: lld-link /lib /llvmlibthin /out:dog.lib %t/lib/../dog.o
+RUN: lld-link /lib /llvmlibthin /out:cat.lib cat.o
+RUN: lld-link /lib /llvmlibthin /out:start.lib start.o
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+RUN: lld-link /subsystem:console /machine:x64 /entry:start /out:my.exe  \
+RUN:   %t/foo.lib %t/lib/bar.lib ../start.lib %t/cat.lib \
+RUN:   /includeoptional:dog ../dog.lib \
+RUN:   -thinlto-distributor:%python \
+RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \
+RUN:   -thinlto-remote-compiler:%clang \
+RUN:   /lldsavetemps
+
+## Check that the required output files have been created.
+RUN: ls | FileCheck %s --check-prefix=OUTPUTS --implicit-check-not=cat
+
+## JSON jobs description.
+OUTPUTS-DAG: my.[[PID:[a-zA-Z0-9_]+]].dist-file.json
+
+## Individual summary index files.
+OUTPUTS-DAG: start.1.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   dog.2.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   foo.3.[[PID]].native.o.thinlto.bc{{$}}
+OUTPUTS-DAG:   bar.4.[[PID]].native.o.thinlto.bc{{$}}
+
+## Native output object files.
+OUTPUTS-DAG: start.1.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   dog.2.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   foo.3.[[PID]].native.o{{$}}
+OUTPUTS-DAG:   bar.4.[[PID]].native.o{{$}}
+
+
+## It is important that cross-module inlining occurs for this test to show that Clang can
+## successfully load the bitcode file dependencies recorded in the summary indices.
+## Explicitly check that the expected importing has occurred.
+
+RUN: llvm-dis start.1.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis dog.2.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,DOG,START
+
+RUN: llvm-dis foo.3.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+RUN: llvm-dis bar.4.*.native.o.thinlto.bc -o - | \
+RUN:   FileCheck %s --check-prefixes=FOO,BAR,START
+
+FOO-DAG:   foo.o
+BAR-DAG:   bar.o
+DOG-DAG:   dog.o
+START-DAG: start.o
+
+
+#--- foo.c
+extern int bar(int), start(int);
+__attribute__((retain)) int foo(int x) { return x + bar(x) + start(x); }
+
+#--- bar.c
+extern int foo(int), start(int);
+__attribute__((retain)) int bar(int x) { return x + foo(x) + start(x); }
+
+#--- dog.c
+extern int foo(int), bar(int), start(int);
+__attribute__((retain)) int dog(int x) { return x + foo(x) + bar(x) + start(x); }
+
+#--- cat.c
+__attribute__((retain)) void cat(int x) {}
+
+#--- start.c
+extern int foo(int), bar(int);
+__attribute__((retain)) int start(int x) { return x + foo(x) + bar(x); }
diff --git a/cross-project-tests/dtlto/link-dtlto.c b/cross-project-tests/dtlto/link-dtlto.c
new file mode 100644
index 000000000000..0ab4ec57f115
--- /dev/null
+++ b/cross-project-tests/dtlto/link-dtlto.c
@@ -0,0 +1,41 @@
+// REQUIRES: lld-link
+
+/// Simple test that DTLTO works with a single input bitcode file and that
+/// --save-temps can be applied to the remote compilation.
+
+// RUN: rm -rf %t && mkdir %t && cd %t
+
+// RUN: %clang --target=x86_64-pc-windows-msvc -c -flto=thin %s -o dtlto.obj
+
+// RUN: lld-link /subsystem:console /entry:_start dtlto.obj \
+// RUN:   -thinlto-distributor:%python \
+// RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/local.py \
+// RUN:   -thinlto-remote-compiler:%clang \
+// RUN:   -thinlto-remote-compiler-arg:--save-temps
+
+/// Check that the required output files have been created.
+// RUN: ls | sort | FileCheck %s
+
+/// No files are expected before.
+// CHECK-NOT: {{.}}
+
+/// Linked ELF.
+// CHECK: {{^}}dtlto.exe{{$}}
+
+/// Produced by the bitcode compilation.
+// CHECK-NEXT: {{^}}dtlto.obj{{$}}
+
+/// --save-temps output for the backend compilation.
+// CHECK-NEXT: {{^}}dtlto.s{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.0.preopt.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.1.promote.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.2.internalize.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.3.import.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.4.opt.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.5.precodegen.bc{{$}}
+// CHECK-NEXT: {{^}}dtlto.s.resolution.txt{{$}}
+
+/// No files are expected after.
+// CHECK-NOT: {{.}}
+
+int _start() { return 0; }
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index b35c643ac898..ac2775347264 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -19,7 +19,7 @@
 config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
 
 # suffixes: A list of file extensions to treat as test files.
-config.suffixes = [".c", ".cl", ".cpp", ".m"]
+config.suffixes = [".c", ".cl", ".cpp", ".m", ".test"]
 
 # excludes: A list of directories to exclude from the testsuite. The 'Inputs'
 # subdirectories contain auxiliary inputs for various tests in their parent
@@ -107,6 +107,8 @@ def get_required_attr(config, attr_name):
 if lldb_path is not None:
     config.available_features.add("lldb")
 
+if llvm_config.use_llvm_tool("llvm-ar"):
+    config.available_features.add("llvm-ar")
 
 def configure_dexter_substitutions():
     """Configure substitutions for host platform and return list of dependencies"""
diff --git a/flang-rt/cmake/modules/AddFlangRT.cmake b/flang-rt/cmake/modules/AddFlangRT.cmake
index e51590fdae3d..58541609829c 100644
--- a/flang-rt/cmake/modules/AddFlangRT.cmake
+++ b/flang-rt/cmake/modules/AddFlangRT.cmake
@@ -286,27 +286,6 @@ function (add_flangrt_library name)
       target_compile_options(${tgtname} PUBLIC -U_LIBCPP_ENABLE_ASSERTIONS)
     endif ()
 
-    # Flang/Clang (including clang-cl) -compiled programs targeting the MSVC ABI
-    # should only depend on msvcrt/ucrt. LLVM still emits libgcc/compiler-rt
-    # functions in some cases like 128-bit integer math (__udivti3, __modti3,
-    # __fixsfti, __floattidf, ...) that msvc does not support. We are injecting a
-    # dependency to Compiler-RT's builtin library where these are implemented.
-    if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-      if (FLANG_RT_BUILTINS_LIBRARY)
-        target_compile_options(${tgtname} PRIVATE "$<$<COMPILE_LANGUAGE:CXX,C>:-Xclang>" "$<$<COMPILE_LANGUAGE:CXX,C>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
-      endif ()
-    endif ()
-    if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-      if (FLANG_RT_BUILTINS_LIBRARY)
-        target_compile_options(${tgtname} PRIVATE "$<$<COMPILE_LANGUAGE:Fortran>:-Xflang>" "$<$<COMPILE_LANGUAGE:Fortran>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
-      else ()
-        message(WARNING "Did not find libclang_rt.builtins.lib.
-          LLVM may emit builtins that are not implemented in msvcrt/ucrt and
-          instead falls back to builtins from Compiler-RT. Linking with ${tgtname}
-          may result in a linker error.")
-      endif ()
-    endif ()
-
     # Non-GTest unittests depend on LLVMSupport
     if (ARG_LINK_TO_LLVM)
       if (LLVM_LINK_LLVM_DYLIB)
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index 332c0872e065..dc2db1d9902c 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -251,19 +251,33 @@ else()
   add_win_flangrt_runtime(STATIC dynamic     MultiThreadedDLL      INSTALL_WITH_TOOLCHAIN)
   add_win_flangrt_runtime(STATIC dynamic_dbg MultiThreadedDebugDLL INSTALL_WITH_TOOLCHAIN)
 
-  # Unittests link against LLVMSupport which is using CMake's default runtime
-  # library selection, which is either MultiThreadedDLL or MultiThreadedDebugDLL
-  # depending on the configuration. They have to match or linking will fail.
+  # Unittests link against LLVMSupport. If CMAKE_MSVC_RUNTIME_LIBRARY is set,
+  # that will have been used for LLVMSupport so it must also be used here.
+  # Otherwise this will use CMake's default runtime library selection, which
+  # is either MultiThreadedDLL or MultiThreadedDebugDLL depending on the configuration.
+  # They have to match or linking will fail.
   if (GENERATOR_IS_MULTI_CONFIG)
     # We cannot select an ALIAS library because it may be different
     # per configuration. Fallback to CMake's default.
     add_win_flangrt_runtime(STATIC unittest "" EXCLUDE_FROM_ALL)
   else ()
-    string(TOLOWER ${CMAKE_BUILD_TYPE} build_type)
-    if (build_type STREQUAL "debug")
-      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
-    else ()
-      add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
-    endif ()
+    # Check if CMAKE_MSVC_RUNTIME_LIBRARY was set.
+    if (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreaded")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDLL")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebug")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.static_dbg)
+    elseif (CMAKE_MSVC_RUNTIME_LIBRARY STREQUAL "MultiThreadedDebugDLL")
+        add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
+    else()
+      # Default based on the build type.
+      string(TOLOWER ${CMAKE_BUILD_TYPE} build_type)
+      if (build_type STREQUAL "debug")
+          add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic_dbg)
+      else ()
+          add_library(flang_rt.runtime.unittest ALIAS flang_rt.runtime.dynamic)
+      endif ()
+    endif()
   endif ()
 endif()
diff --git a/flang-rt/unittests/CMakeLists.txt b/flang-rt/unittests/CMakeLists.txt
index 528219617413..fd63ad11dcf4 100644
--- a/flang-rt/unittests/CMakeLists.txt
+++ b/flang-rt/unittests/CMakeLists.txt
@@ -60,6 +60,27 @@ function(add_flangrt_unittest_offload_properties target)
   endif()
 endfunction()
 
+# flang-rt on Windows requires compiler-rt for some symbols. For binaries built
+# with flang this dependency is added by the flang driver, but since the unit
+# tests are built with clang we need to add the dependency manually.
+function(add_flangrt_dependent_libs target)
+  if (MSVC AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CXX,C>:-Xclang>" "$<$<COMPILE_LANGUAGE:CXX,C>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    endif ()
+  endif ()
+  if (MSVC AND CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+    if (FLANG_RT_BUILTINS_LIBRARY)
+      target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:Fortran>:-Xflang>" "$<$<COMPILE_LANGUAGE:Fortran>:--dependent-lib=${FLANG_RT_BUILTINS_LIBRARY}>")
+    else ()
+      message(WARNING "Did not find libclang_rt.builtins.lib.
+        LLVM may emit builtins that are not implemented in msvcrt/ucrt and
+        instead falls back to builtins from Compiler-RT. Linking with ${tgtname}
+        may result in a linker error.")
+    endif ()
+  endif ()
+endfunction()
+
 
 function(add_flangrt_unittest test_dirname)
   cmake_parse_arguments(ARG
@@ -72,14 +93,7 @@ function(add_flangrt_unittest test_dirname)
 
   target_link_libraries(${test_dirname} PRIVATE ${ARG_LINK_LIBS})
   add_flangrt_unittest_offload_properties(${test_dirname})
-
-  # Required because LLVMSupport is compiled with this option.
-  # FIXME: According to CMake documentation, this is the default. Why is it
-  #        needed? LLVM's add_unittest doesn't set it either.
-  set_target_properties(${test_dirname}
-      PROPERTIES
-        MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>DLL"
-    )
+  add_flangrt_dependent_libs(${test_dirname})
 endfunction()
 
 function(add_flangrt_nongtest_unittest test_name)
@@ -99,6 +113,7 @@ function(add_flangrt_nongtest_unittest test_name)
   set_target_properties(${test_name}${suffix} PROPERTIES FOLDER "Flang-RT/Tests/Unit")
 
   target_link_libraries(${test_name}${suffix} PRIVATE NonGTestTesting ${ARG_LINK_LIBS})
+  add_flangrt_dependent_libs(${test_name}${suffix})
 
   if(NOT ARG_SLOW_TEST)
     add_dependencies(FlangRTUnitTests ${test_name}${suffix})
diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md
index c9f19c37fd7f..81f5f9f6dee5 100644
--- a/flang/docs/OpenMPSupport.md
+++ b/flang/docs/OpenMPSupport.md
@@ -41,7 +41,7 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | target construct                                           | P      | device clause not supported |
 | target update construct                                    | P      | device clause not supported |
 | declare target directive                                   | P      | |
-| teams construct                                            | P      | reduction clause not supported |
+| teams construct                                            | Y      | |
 | distribute construct                                       | P      | dist_schedule clause not supported |
 | distribute simd construct                                  | P      | dist_schedule and linear clauses are not supported |
 | distribute parallel loop construct                         | P      | dist_schedule clause not supported |
@@ -51,15 +51,15 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low
 | atomic construct extensions                                | Y      | |
 | cancel construct                                           | Y      | |
 | cancellation point construct                               | Y      | |
-| parallel do simd construct                                 | P      | linear clause is not supported |
-| target teams construct                                     | P      | device and reduction clauses are not supported |
-| teams distribute construct                                 | P      | reduction and dist_schedule clauses not supported |
-| teams distribute simd construct                            | P      | reduction, dist_schedule and linear clauses are not supported |
-| target teams distribute construct                          | P      | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop construct                   | P      | reduction and dist_schedule clauses are not supported |
-| target teams distribute parallel loop construct            | P      | device, reduction and dist_schedule clauses are not supported |
-| teams distribute parallel loop simd construct              | P      | reduction, dist_schedule, and linear clauses are not supported |
-| target teams distribute parallel loop simd construct       | P      | device, reduction, dist_schedule and linear clauses are not supported |
+| parallel do simd construct                                 | P      | linear clause not supported |
+| target teams construct                                     | P      | device clause not supported |
+| teams distribute construct                                 | P      | dist_schedule clause not supported |
+| teams distribute simd construct                            | P      | dist_schedule and linear clauses are not supported |
+| target teams distribute construct                          | P      | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop construct                   | P      | dist_schedule clause not supported |
+| target teams distribute parallel loop construct            | P      | device and dist_schedule clauses are not supported |
+| teams distribute parallel loop simd construct              | P      | dist_schedule and linear clauses are not supported |
+| target teams distribute parallel loop simd construct       | P      | device, dist_schedule and linear clauses are not supported |
 
 ## Extensions
 ### ATOMIC construct
diff --git a/flang/include/flang/Lower/ConvertType.h b/flang/include/flang/Lower/ConvertType.h
index 179a68258404..3c726595c0f7 100644
--- a/flang/include/flang/Lower/ConvertType.h
+++ b/flang/include/flang/Lower/ConvertType.h
@@ -118,6 +118,9 @@ class ComponentReverseIterator {
   /// Advance iterator to the last components of the current type parent.
   const Fortran::semantics::DerivedTypeSpec &advanceToParentType();
 
+  /// Get the parent component symbol for the current type.
+  const Fortran::semantics::Symbol *getParentComponent() const;
+
 private:
   void setCurrentType(const Fortran::semantics::DerivedTypeSpec &derived);
   const Fortran::semantics::DerivedTypeSpec *currentParentType = nullptr;
diff --git a/flang/include/flang/Lower/Support/ReductionProcessor.h b/flang/include/flang/Lower/Support/ReductionProcessor.h
index 72d8a0096f51..0b49049d43ed 100644
--- a/flang/include/flang/Lower/Support/ReductionProcessor.h
+++ b/flang/include/flang/Lower/Support/ReductionProcessor.h
@@ -124,7 +124,7 @@ class ReductionProcessor {
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
   template <typename OpType, typename RedOperatorListTy>
-  static void processReductionArguments(
+  static bool processReductionArguments(
       mlir::Location currentLocation, lower::AbstractConverter &converter,
       const RedOperatorListTy &redOperatorList,
       llvm::SmallVectorImpl<mlir::Value> &reductionVars,
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 33c1f1e7a3c3..2105fc14b0a6 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2123,9 +2123,10 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     llvm::SmallVector<mlir::Value> reduceVars;
     Fortran::lower::omp::ReductionProcessor rp;
-    rp.processReductionArguments<fir::DeclareReductionOp>(
+    bool result = rp.processReductionArguments<fir::DeclareReductionOp>(
         toLocation(), *this, info.reduceOperatorList, reduceVars,
         reduceVarByRef, reductionDeclSymbols, info.reduceSymList);
+    assert(result && "Failed to process `do concurrent` reductions");
 
     doConcurrentLoopOp.getReduceVarsMutable().assign(reduceVars);
     doConcurrentLoopOp.setReduceSymsAttr(
@@ -5504,10 +5505,34 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   void genFIR(const Fortran::parser::AssignStmt &stmt) {
     const Fortran::semantics::Symbol &symbol =
         *std::get<Fortran::parser::Name>(stmt.t).symbol;
+
     mlir::Location loc = toLocation();
+    mlir::Type symbolType = genType(symbol);
+    mlir::Value addr = getSymbolAddress(symbol);
+
+    // Handle the case where the assigned variable is declared as a pointer
+    if (auto eleTy = fir::dyn_cast_ptrOrBoxEleTy(symbolType)) {
+      if (auto ptrType = mlir::dyn_cast<fir::PointerType>(eleTy)) {
+        symbolType = ptrType.getEleTy();
+      } else {
+        symbolType = eleTy;
+      }
+    } else if (auto ptrType = mlir::dyn_cast<fir::PointerType>(symbolType)) {
+      symbolType = ptrType.getEleTy();
+    }
+
     mlir::Value labelValue = builder->createIntegerConstant(
-        loc, genType(symbol), std::get<Fortran::parser::Label>(stmt.t));
-    builder->create<fir::StoreOp>(loc, labelValue, getSymbolAddress(symbol));
+        loc, symbolType, std::get<Fortran::parser::Label>(stmt.t));
+
+    // If the address points to a boxed pointer, we need to dereference it
+    if (auto refType = mlir::dyn_cast<fir::ReferenceType>(addr.getType())) {
+      if (auto boxType = mlir::dyn_cast<fir::BoxType>(refType.getEleTy())) {
+        mlir::Value boxValue = builder->create<fir::LoadOp>(loc, addr);
+        addr = builder->create<fir::BoxAddrOp>(loc, boxValue);
+      }
+    }
+
+    builder->create<fir::StoreOp>(loc, labelValue, addr);
   }
 
   void genFIR(const Fortran::parser::FormatStmt &) {
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 9689f920840f..906a68982188 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -1848,8 +1848,15 @@ class HlfirBuilder {
       for (Fortran::lower::ComponentReverseIterator compIterator(
                ctor.result().derivedTypeSpec());
            !compIterator.lookup(compSym.name());) {
-        const auto &parentType = compIterator.advanceToParentType();
-        llvm::StringRef parentName = toStringRef(parentType.name());
+        // Private parent components have mangled names. Get the name from the
+        // parent symbol.
+        const Fortran::semantics::Symbol *parentCompSym =
+            compIterator.getParentComponent();
+        assert(parentCompSym && "failed to get parent component symbol");
+        std::string parentName =
+            converter.getRecordTypeFieldName(*parentCompSym);
+        // Advance the iterator, but don't use its return value.
+        compIterator.advanceToParentType();
         auto baseRecTy = mlir::cast<fir::RecordType>(
             hlfir::getFortranElementType(currentParent.getType()));
         auto parentCompType = baseRecTy.getType(parentName);
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index 7a2e8e509518..0fde61465fb8 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -669,6 +669,18 @@ Fortran::lower::ComponentReverseIterator::advanceToParentType() {
   return *currentParentType;
 }
 
+const Fortran::semantics::Symbol *
+Fortran::lower::ComponentReverseIterator::getParentComponent() const {
+  if (!currentTypeDetails->GetParentComponentName())
+    return nullptr;
+  const Fortran::semantics::Scope *scope = currentParentType->GetScope();
+  auto parentComp =
+      DEREF(scope).find(currentTypeDetails->GetParentComponentName().value());
+  if (parentComp == scope->cend())
+    return nullptr;
+  return &*parentComp->second;
+}
+
 void Fortran::lower::ComponentReverseIterator::setCurrentType(
     const Fortran::semantics::DerivedTypeSpec &derived) {
   currentParentType = &derived;
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index 63a612d7ead6..d64449373fca 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -464,8 +464,10 @@ getNamelistGroup(Fortran::lower::AbstractConverter &converter,
       fir::BoxType boxTy =
           fir::BoxType::get(fir::PointerType::get(converter.genType(s)));
       auto descFunc = [&](fir::FirOpBuilder &b) {
+        bool couldBeInEquivalence =
+            Fortran::semantics::FindEquivalenceSet(s) != nullptr;
         auto box = Fortran::lower::genInitialDataTarget(
-            converter, loc, boxTy, *expr, /*couldBeInEquivalence=*/true);
+            converter, loc, boxTy, *expr, couldBeInEquivalence);
         b.create<fir::HasValueOp>(loc, box);
       };
       builder.createGlobalConstant(loc, boxTy, mangleName, descFunc, linkOnce);
diff --git a/flang/lib/Lower/Mangler.cpp b/flang/lib/Lower/Mangler.cpp
index 1333e3fe349d..e1ae86a1b5bb 100644
--- a/flang/lib/Lower/Mangler.cpp
+++ b/flang/lib/Lower/Mangler.cpp
@@ -224,8 +224,18 @@ std::string Fortran::lower::mangle::mangleName(
       assert(paramExpr && "derived type kind param not explicit");
       std::optional<int64_t> init =
           Fortran::evaluate::ToInt64(paramValue->GetExplicit());
-      assert(init && "derived type kind param is not constant");
-      kinds.emplace_back(*init);
+      // TODO: put the assertion check back when parametrized derived types
+      // are supported:
+      // assert(init && "derived type kind param is not constant");
+      //
+      // The init parameter above will require a FoldingContext for proper
+      // expression evaluation to an integer constant, otherwise the
+      // compiler may crash here (see example in issue #127424).
+      if (!init) {
+        TODO_NOLOC("parameterized derived types");
+      } else {
+        kinds.emplace_back(*init);
+      }
     }
   }
   return fir::NameUniquer::doType(modules, procs, blockId, symbolName, kinds);
diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
index 74087d42a8e6..40f735c33cc3 100644
--- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp
@@ -1116,11 +1116,12 @@ bool ClauseProcessor::processInReduction(
         collectReductionSyms(clause, inReductionSyms);
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            inReductionVars, inReduceVarByRef, inReductionDeclSymbols,
-            inReductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                inReductionVars, inReduceVarByRef, inReductionDeclSymbols,
+                inReductionSyms))
+          inReductionSyms.clear();
 
         // Copy local lists into the output.
         llvm::copy(inReductionVars, std::back_inserter(result.inReductionVars));
@@ -1461,10 +1462,12 @@ bool ClauseProcessor::processReduction(
         }
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            reductionVars, reduceVarByRef, reductionDeclSymbols, reductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                reductionVars, reduceVarByRef, reductionDeclSymbols,
+                reductionSyms))
+          reductionSyms.clear();
         // Copy local lists into the output.
         llvm::copy(reductionVars, std::back_inserter(result.reductionVars));
         llvm::copy(reduceVarByRef, std::back_inserter(result.reductionByref));
@@ -1486,11 +1489,12 @@ bool ClauseProcessor::processTaskReduction(
         collectReductionSyms(clause, taskReductionSyms);
 
         ReductionProcessor rp;
-        rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
-            currentLocation, converter,
-            std::get<typename omp::clause::ReductionOperatorList>(clause.t),
-            taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols,
-            taskReductionSyms);
+        if (!rp.processReductionArguments<mlir::omp::DeclareReductionOp>(
+                currentLocation, converter,
+                std::get<typename omp::clause::ReductionOperatorList>(clause.t),
+                taskReductionVars, taskReduceVarByRef, taskReductionDeclSymbols,
+                taskReductionSyms))
+          taskReductionSyms.clear();
         // Copy local lists into the output.
         llvm::copy(taskReductionVars,
                    std::back_inserter(result.taskReductionVars));
diff --git a/flang/lib/Lower/Support/ReductionProcessor.cpp b/flang/lib/Lower/Support/ReductionProcessor.cpp
index c0be1e229f82..f58a103b0347 100644
--- a/flang/lib/Lower/Support/ReductionProcessor.cpp
+++ b/flang/lib/Lower/Support/ReductionProcessor.cpp
@@ -39,7 +39,7 @@ namespace lower {
 namespace omp {
 
 // explicit template declarations
-template void ReductionProcessor::processReductionArguments<
+template bool ReductionProcessor::processReductionArguments<
     mlir::omp::DeclareReductionOp, omp::clause::ReductionOperatorList>(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const omp::clause::ReductionOperatorList &redOperatorList,
@@ -48,7 +48,7 @@ template void ReductionProcessor::processReductionArguments<
     llvm::SmallVectorImpl<mlir::Attribute> &reductionDeclSymbols,
     const llvm::SmallVectorImpl<const semantics::Symbol *> &reductionSymbols);
 
-template void ReductionProcessor::processReductionArguments<
+template bool ReductionProcessor::processReductionArguments<
     fir::DeclareReductionOp, llvm::SmallVector<fir::ReduceOperationEnum>>(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const llvm::SmallVector<fir::ReduceOperationEnum> &redOperatorList,
@@ -606,7 +606,7 @@ static bool doReductionByRef(mlir::Value reductionVar) {
 }
 
 template <typename OpType, typename RedOperatorListTy>
-void ReductionProcessor::processReductionArguments(
+bool ReductionProcessor::processReductionArguments(
     mlir::Location currentLocation, lower::AbstractConverter &converter,
     const RedOperatorListTy &redOperatorList,
     llvm::SmallVectorImpl<mlir::Value> &reductionVars,
@@ -626,10 +626,10 @@ void ReductionProcessor::processReductionArguments(
               std::get_if<omp::clause::ProcedureDesignator>(&redOperator.u)) {
         if (!ReductionProcessor::supportedIntrinsicProcReduction(
                 *reductionIntrinsic)) {
-          return;
+          return false;
         }
       } else {
-        return;
+        return false;
       }
     }
   }
@@ -764,6 +764,8 @@ void ReductionProcessor::processReductionArguments(
 
   if (isDoConcurrent)
     builder.restoreInsertionPoint(dcIP);
+
+  return true;
 }
 
 const semantics::SourceName
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 2425265e196c..783aef18fb89 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -16,7 +16,6 @@
 #include "flang/Common/idioms.h"
 #include "flang/Common/indirection.h"
 #include "flang/Common/visit.h"
-#include "flang/Evaluate/shape.h"
 #include "flang/Evaluate/tools.h"
 #include "flang/Evaluate/type.h"
 #include "flang/Parser/char-block.h"
@@ -4117,21 +4116,26 @@ void OmpStructureChecker::CheckArraySection(
   // Detect this by looking for array accesses on character variables which are
   // not arrays.
   bool isSubstring{false};
-  evaluate::ExpressionAnalyzer ea{context_};
-  if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
-    std::optional<evaluate::Shape> shape = evaluate::GetShape(expr);
-    // Not an array: rank 0
-    if (shape && shape->size() == 0) {
-      if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
-        if (type->category() == evaluate::TypeCategory::Character) {
-          // Substrings are explicitly denied by the standard [6.0:163:9-11].
-          // This is supported as an extension. This restriction was added in
-          // OpenMP 5.2.
-          isSubstring = true;
-          context_.Say(GetContext().clauseSource,
-              "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
-        } else {
-          llvm_unreachable("Array indexing on a variable that isn't an array");
+  // Cannot analyze a base of an assumed-size array on its own. If we know
+  // this is an array (assumed-size or not) we can ignore it, since we're
+  // looking for strings.
+  if (!IsAssumedSizeArray(*name.symbol)) {
+    evaluate::ExpressionAnalyzer ea{context_};
+    if (MaybeExpr expr = ea.Analyze(arrayElement.base)) {
+      if (expr->Rank() == 0) {
+        // Not an array: rank 0
+        if (std::optional<evaluate::DynamicType> type = expr->GetType()) {
+          if (type->category() == evaluate::TypeCategory::Character) {
+            // Substrings are explicitly denied by the standard [6.0:163:9-11].
+            // This is supported as an extension. This restriction was added in
+            // OpenMP 5.2.
+            isSubstring = true;
+            context_.Say(GetContext().clauseSource,
+                "The use of substrings in OpenMP argument lists has been disallowed since OpenMP 5.2."_port_en_US);
+          } else {
+            llvm_unreachable(
+                "Array indexing on a variable that isn't an array");
+          }
         }
       }
     }
diff --git a/flang/test/Driver/loop-interchange.f90 b/flang/test/Driver/loop-interchange.f90
index 5d3ec71c5987..1e5a11902709 100644
--- a/flang/test/Driver/loop-interchange.f90
+++ b/flang/test/Driver/loop-interchange.f90
@@ -2,9 +2,9 @@
 ! RUN: %flang -### -S -fno-loop-interchange %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -O0 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -O1 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
-! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -O2 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -O3 %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
+! RUN: %flang -### -S -Os %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! RUN: %flang -### -S -Oz %s 2>&1 | FileCheck -check-prefix=CHECK-NO-LOOP-INTERCHANGE %s
 ! CHECK-LOOP-INTERCHANGE: "-floop-interchange"
 ! CHECK-NO-LOOP-INTERCHANGE-NOT: "-floop-interchange"
diff --git a/flang/test/Driver/target-cpu-features.f90 b/flang/test/Driver/target-cpu-features.f90
index 5a3fd0d83800..e7da964184c8 100644
--- a/flang/test/Driver/target-cpu-features.f90
+++ b/flang/test/Driver/target-cpu-features.f90
@@ -44,6 +44,10 @@
 ! RUN: %flang --target=loongarch64-linux-gnu -c %s -### 2>&1 \
 ! RUN: | FileCheck %s -check-prefix=CHECK-LOONGARCH64
 
+! RUN: %flang --target=sparc64-linux-gnu -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+! RUN: %flang --target=sparc64-freebsd -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+! RUN: %flang --target=sparc64-openbsd -c -### %s 2>&1  | FileCheck %s -check-prefix=CHECK-SPARC-VIS
+
 ! CHECK-A57: "-fc1" "-triple" "aarch64-unknown-linux-gnu"
 ! CHECK-A57-SAME: "-target-cpu" "cortex-a57"
 ! CHECK-A57-SAME: "-target-feature" "+v8a" "-target-feature" "+aes" "-target-feature" "+crc" "-target-feature" "+fp-armv8" "-target-feature" "+neon" "-target-feature" "+perfmon" "-target-feature" "+sha2
@@ -92,3 +96,6 @@
 
 ! CHECK-LOONGARCH64: "-fc1" "-triple" "loongarch64-unknown-linux-gnu"
 ! CHECK-LOONGARCH64-SAME: "-target-cpu" "loongarch64" "-target-feature" "+lsx" "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+ual"
+
+! CHECK-SPARC-VIS: "-fc1" "-triple" "sparc64-{{[^"]+}}"
+! CHECK-SPARC-VIS-SAME: "-target-feature" "+vis"
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90
new file mode 100644
index 000000000000..70b2ae1111a5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-non-intrinsic.f90
@@ -0,0 +1,25 @@
+! Tests reduction processor behavior when a reduction symbol is not supported.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+subroutine foo
+  implicit none
+  integer :: k, i
+
+  interface max
+    function max(m1,m2)
+      integer :: m1, m2
+    end function
+  end interface
+
+  !$omp do reduction (max: k)
+  do i=1,10
+  end do
+  !$omp end do
+end
+
+! Verify that unsupported reduction is ignored.
+! CHECK: omp.wsloop 
+! CHECK-SAME: private(@{{[^[:space:]]+}} %{{[^[:space:]]+}}
+! CHECK-SAME:         -> %{{[^[:space:]]+}} : !{{[^[:space:]]+}}) {
+! CHECK: }
diff --git a/flang/test/Lower/assign-statement.f90 b/flang/test/Lower/assign-statement.f90
new file mode 100644
index 000000000000..342355bf469c
--- /dev/null
+++ b/flang/test/Lower/assign-statement.f90
@@ -0,0 +1,12 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+  ! CHECK-LABEL: func @_QQmain
+  program main
+  integer :: ip
+  pointer :: ip
+
+  allocate(ip)
+  assign 10 to ip
+  ! CHECK: fir.store %c10_i32 to %11 : !fir.ptr<i32>
+  10 return
+  end program main
diff --git a/flang/test/Lower/derived-type-private.f90 b/flang/test/Lower/derived-type-private.f90
new file mode 100644
index 000000000000..8edcdeedad8b
--- /dev/null
+++ b/flang/test/Lower/derived-type-private.f90
@@ -0,0 +1,29 @@
+! Test lowering of derived type with private attribute
+! RUN: bbc -emit-hlfir %s -o - | FileCheck %s
+
+program main
+  call test02()
+  print *,"pass"
+end program main
+
+module mod2
+  type,private:: tt
+     integer :: ip = 1
+  end type tt
+  type,extends(tt):: ty1
+  ! CHECK: fir.global @_QMmod2Estr : !fir.type<_QMmod2Tty1{_QMmod2Tty1.tt:!fir.type<_QMmod2Ttt{ip:i32}>,i1:i32,i1p:!fir.type<_QMmod2Ttt{ip:i32}>,i1a:!fir.box<!fir.heap<!fir.array<?xi32>>>}>
+     integer :: i1 = 1
+     type(tt) :: i1p = tt(2)
+     integer,allocatable :: i1a(:)
+  end type ty1
+  type(ty1) :: str
+end module mod2
+
+subroutine test02()
+  use mod2
+  integer,allocatable :: ia(:)
+  allocate(ia(10))
+  ia=2
+  str=ty1(i1a=ia)
+  if (str%i1.ne.1) print *,'ng'
+end subroutine test02
diff --git a/flang/test/Lower/equivalence-3.f b/flang/test/Lower/equivalence-3.f
new file mode 100644
index 000000000000..19f8880189f0
--- /dev/null
+++ b/flang/test/Lower/equivalence-3.f
@@ -0,0 +1,19 @@
+! RUN: bbc -emit-fir -o - %s | FileCheck %s
+
+  ! CHECK-LABEL: func @_QQmain
+  program main
+  real a1,a2
+  equivalence (a1,a2)
+  ! A fir.alloca should never appear in a global constant initialization.
+  ! CHECK: fir.global linkonce @_QFEx1.desc constant : !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  ! CHECK: arith.constant 5 : index
+  ! CHECK:fir.address_of(@_QFEx1) : !fir.ref<!fir.array<5xf64>>
+  ! CHECK: fir.shape %c5 : (index) -> !fir.shape<1>
+  ! CHECK: fir.declare %0(%1) {uniq_name = "_QFEx1"} : (!fir.ref<!fir.array<5xf64>>, !fir.shape<1>) -> !fir.ref<!fir.array<5xf64>>
+  ! CHECK: fir.embox %2(%1) : (!fir.ref<!fir.array<5xf64>>, !fir.shape<1>) -> !fir.box<!fir.array<5xf64>>
+  ! CHECK: fir.rebox %3 : (!fir.box<!fir.array<5xf64>>) -> !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  ! CHECK: fir.has_value %4 : !fir.box<!fir.ptr<!fir.array<5xf64>>>
+  real*8 x1(5)
+  namelist /y1/x1
+  read (5,y1)
+  end
diff --git a/flang/test/Lower/parametrized-derived-types.f90 b/flang/test/Lower/parametrized-derived-types.f90
new file mode 100644
index 000000000000..97a40c9169d2
--- /dev/null
+++ b/flang/test/Lower/parametrized-derived-types.f90
@@ -0,0 +1,19 @@
+! RUN: %python %S/test_errors.py %s %flang_fc1
+! XFAIL: *
+program main
+  TYPE ty(k1,k2)
+     INTEGER ,KIND::k1,k2=5
+     INTEGER::arr(k1:k2)=10
+     CHARACTER(LEN=k2)::CHARACTER
+  END TYPE ty
+  TYPE,EXTENDS(ty)::ty1(k3)
+     INTEGER,KIND ::k3=4
+     TYPE(ty(2,k3+1))::cmp_ty = ty(2,k3+1)(55,'HI')
+  END TYPE ty1
+  TYPE ty2(l1, l2)
+  !ERROR: not yet implemented: parameterized derived types
+     INTEGER,LEN ::l1,l2
+     TYPE(ty1(2,5)), ALLOCATABLE::ty1_cmp(:)
+  END TYPE ty2
+  TYPE(ty2(4,8)) ::ty2_obj
+end program main
diff --git a/flang/test/Semantics/OpenMP/assumed-size-array.f90 b/flang/test/Semantics/OpenMP/assumed-size-array.f90
new file mode 100644
index 000000000000..6e36db178dc8
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/assumed-size-array.f90
@@ -0,0 +1,25 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! This should compile without errors. Check for a symptom of a reasonable
+! output.
+
+!CHECK: omp.task depend
+
+subroutine omp_task_depend_reproducer(work, myid, shift)
+  implicit none
+  integer, intent(in) :: myid, shift
+  real, intent(inout) :: work(*)
+
+!$omp parallel shared(work, myid, shift)
+  !$omp single
+    !$omp task depend(in:work(myid+shift-1)) depend(in:work(myid-1)) depend(out:work(myid))
+      call dummy_kernel(work(myid))
+    !$omp end task
+  !$omp end single
+!$omp end parallel
+contains
+  subroutine dummy_kernel(x)
+    real :: x
+    x = x + 1.0
+  end subroutine dummy_kernel
+end subroutine omp_task_depend_reproducer
diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt
index 469266cc8155..7516157731b5 100644
--- a/flang/tools/bbc/CMakeLists.txt
+++ b/flang/tools/bbc/CMakeLists.txt
@@ -30,6 +30,11 @@ target_link_libraries(bbc PRIVATE
   flangFrontend
   flangPasses
   FlangOpenMPTransforms
+  FortranSupport
+  FortranParser
+  FortranEvaluate
+  FortranSemantics
+  FortranLower
 )
 
 mlir_target_link_libraries(bbc PRIVATE
@@ -37,9 +42,4 @@ mlir_target_link_libraries(bbc PRIVATE
   ${extension_libs}
   MLIRAffineToStandard
   MLIRSCFToControlFlow
-  FortranSupport
-  FortranParser
-  FortranEvaluate
-  FortranSemantics
-  FortranLower
 )
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 2570d1a106d2..c98e2043464d 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -84,9 +84,6 @@ else()
   endif()
 endif()
 
-# Setup the paths where libclc runtimes should be stored.
-set( LIBCLC_OUTPUT_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR} )
-
 if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} )
   message( WARNING "Using custom LLVM tools to build libclc: "
     "${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR}, "
@@ -164,14 +161,34 @@ endif()
 
 list( SORT LIBCLC_TARGETS_TO_BUILD )
 
+# Construct LLVM version define
+set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" )
+
 # This needs to be set before any target that needs it
 # We need to use LLVM_INCLUDE_DIRS here, because if we are linking to an
 # llvm build directory, this includes $src/llvm/include which is where all the
 # headers are not $build/include/ which is what LLVM_INCLUDE_DIR is set to.
 include_directories( ${LLVM_INCLUDE_DIRS} )
 
-# Configure prepare_builtins
-add_subdirectory( utils )
+# Setup prepare_builtins tools
+set(LLVM_LINK_COMPONENTS
+  BitReader
+  BitWriter
+  Core
+  IRReader
+  Support
+)
+if( LIBCLC_STANDALONE_BUILD )
+  add_llvm_executable( prepare_builtins utils/prepare-builtins.cpp )
+  set( prepare_builtins_exe prepare_builtins )
+  set( prepare_builtins_target prepare_builtins )
+else()
+  add_llvm_utility( prepare_builtins utils/prepare-builtins.cpp )
+  setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target )
+endif()
+target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} )
+# These were not properly reported in early LLVM and we don't need them
+target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions )
 
 # Setup arch devices
 set( r600--_devices cedar cypress barts cayman )
diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake
index 056706ee629c..597bb642655e 100644
--- a/libclc/cmake/modules/AddLibclc.cmake
+++ b/libclc/cmake/modules/AddLibclc.cmake
@@ -120,14 +120,14 @@ function(link_bc)
   endif()
 
   add_custom_command(
-    OUTPUT ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc
-    COMMAND ${llvm-link_exe} ${link_flags} -o ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc ${LINK_INPUT_ARG}
+    OUTPUT ${ARG_TARGET}.bc
+    COMMAND ${llvm-link_exe} ${link_flags} -o ${ARG_TARGET}.bc ${LINK_INPUT_ARG}
     DEPENDS ${llvm-link_target} ${ARG_DEPENDENCIES} ${ARG_INPUTS} ${RSP_FILE}
   )
 
-  add_custom_target( ${ARG_TARGET} ALL DEPENDS ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc )
+  add_custom_target( ${ARG_TARGET} ALL DEPENDS ${ARG_TARGET}.bc )
   set_target_properties( ${ARG_TARGET} PROPERTIES
-    TARGET_FILE ${LIBCLC_ARCH_OBJFILE_DIR}/${ARG_TARGET}.bc
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${ARG_TARGET}.bc
     FOLDER "libclc/Device IR/Linking"
   )
 endfunction()
@@ -356,98 +356,65 @@ function(add_libclc_builtin_set)
 
   set( builtins_link_lib $<TARGET_PROPERTY:${builtins_link_lib_tgt},TARGET_FILE> )
 
-  # For SPIR-V targets we diverage at this point and generate SPIR-V using the
-  # llvm-spirv tool.
   if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
-    set( obj_suffix ${ARG_ARCH_SUFFIX}.spv )
-    set( libclc_builtins_lib ${LIBCLC_OUTPUT_LIBRARY_DIR}/${obj_suffix} )
-    add_custom_command( OUTPUT ${libclc_builtins_lib}
-      COMMAND ${llvm-spirv_exe} ${spvflags} -o ${libclc_builtins_lib} ${builtins_link_lib}
+    set( spv_suffix ${ARG_ARCH_SUFFIX}.spv )
+    add_custom_command( OUTPUT ${spv_suffix}
+      COMMAND ${llvm-spirv_exe} ${spvflags} -o ${spv_suffix} ${builtins_link_lib}
       DEPENDS ${llvm-spirv_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
     )
-  else()
-    # Non-SPIR-V targets add an extra step to optimize the bytecode
-    set( builtins_opt_lib_tgt builtins.opt.${ARG_ARCH_SUFFIX} )
+    add_custom_target( "prepare-${spv_suffix}" ALL DEPENDS "${spv_suffix}" )
+    set_target_properties( "prepare-${spv_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${spv_suffix}
+       DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
 
-    add_custom_command( OUTPUT ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-      COMMAND ${opt_exe} ${ARG_OPT_FLAGS} -o ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-        ${builtins_link_lib}
-      DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
-    )
-    add_custom_target( ${builtins_opt_lib_tgt}
-      ALL DEPENDS ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-    )
-    set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES
-      TARGET_FILE ${LIBCLC_ARCH_OBJFILE_DIR}/${builtins_opt_lib_tgt}.bc
-      FOLDER "libclc/Device IR/Opt"
-    )
-
-    set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
-
-    set( obj_suffix ${ARG_ARCH_SUFFIX}.bc )
-    set( libclc_builtins_lib ${LIBCLC_OUTPUT_LIBRARY_DIR}/${obj_suffix} )
-    add_custom_command( OUTPUT ${libclc_builtins_lib}
-      COMMAND ${prepare_builtins_exe} -o ${libclc_builtins_lib} ${builtins_opt_lib}
-      DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target}
-    )
+    return()
   endif()
 
-  # Add a 'prepare' target
-  add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${libclc_builtins_lib} )
-  set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
-
-  # Also add a 'prepare' target for the triple. Since a triple may have
-  # multiple devices, ensure we only try to create the triple target once. The
-  # triple's target will build all of the bytecode for its constituent devices.
-  if( NOT TARGET prepare-${ARG_TRIPLE} )
-    add_custom_target( prepare-${ARG_TRIPLE} ALL )
-  endif()
-  add_dependencies( prepare-${ARG_TRIPLE} prepare-${obj_suffix} )
+  set( builtins_opt_lib_tgt builtins.opt.${ARG_ARCH_SUFFIX} )
 
-  install(
-    FILES ${libclc_builtins_lib}
-    DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
+  # Add opt target
+  add_custom_command( OUTPUT ${builtins_opt_lib_tgt}.bc
+    COMMAND ${opt_exe} ${ARG_OPT_FLAGS} -o ${builtins_opt_lib_tgt}.bc
+      ${builtins_link_lib}
+    DEPENDS ${opt_target} ${builtins_link_lib} ${builtins_link_lib_tgt}
+  )
+  add_custom_target( ${builtins_opt_lib_tgt}
+    ALL DEPENDS ${builtins_opt_lib_tgt}.bc
+  )
+  set_target_properties( ${builtins_opt_lib_tgt} PROPERTIES
+    TARGET_FILE ${CMAKE_CURRENT_BINARY_DIR}/${builtins_opt_lib_tgt}.bc
+    FOLDER "libclc/Device IR/Opt"
   )
 
-  # SPIR-V targets can exit early here
-  if( ARG_ARCH STREQUAL spirv OR ARG_ARCH STREQUAL spirv64 )
-    return()
-  endif()
+  set( builtins_opt_lib $<TARGET_PROPERTY:${builtins_opt_lib_tgt},TARGET_FILE> )
 
-  # Add a test for whether or not the libraries contain unresolved calls which
-  # would usually indicate a build problem. Note that we don't perform this
-  # test for all libclc targets:
-  # * nvptx-- targets don't include workitem builtins
-  # * clspv targets don't include all OpenCL builtins
+  # Add prepare target
+  set( obj_suffix ${ARG_ARCH_SUFFIX}.bc )
+  add_custom_command( OUTPUT ${obj_suffix}
+    COMMAND ${prepare_builtins_exe} -o ${obj_suffix} ${builtins_opt_lib}
+    DEPENDS ${builtins_opt_lib} ${builtins_opt_lib_tgt} ${prepare_builtins_target} )
+  add_custom_target( prepare-${obj_suffix} ALL DEPENDS ${obj_suffix} )
+  set_target_properties( "prepare-${obj_suffix}" PROPERTIES FOLDER "libclc/Device IR/Prepare" )
+
+  # nvptx-- targets don't include workitem builtins, and clspv targets don't
+  # include all OpenCL builtins
   if( NOT ARG_ARCH MATCHES "^(nvptx|clspv)(64)?$" )
     add_test( NAME external-calls-${obj_suffix}
-      COMMAND ./check_external_calls.sh ${libclc_builtins_lib} ${LLVM_TOOLS_BINARY_DIR}
+      COMMAND ./check_external_calls.sh ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} ${LLVM_TOOLS_BINARY_DIR}
       WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} )
   endif()
 
-  if(CMAKE_HOST_UNIX OR LLVM_USE_SYMLINKS)
-    set(LIBCLC_LINK_OR_COPY create_symlink)
-  else()
-    set(LIBCLC_LINK_OR_COPY copy)
-  endif()
-
-  foreach( a IN LISTS ARG_ALIASES )
+  install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${obj_suffix} DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
+  foreach( a ${ARG_ALIASES} )
     set( alias_suffix "${a}-${ARG_TRIPLE}.bc" )
     add_custom_command(
-      OUTPUT ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      COMMAND ${CMAKE_COMMAND} -E ${LIBCLC_LINK_OR_COPY} ${libclc_builtins_lib} ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      DEPENDS prepare-${obj_suffix}
-    )
-    add_custom_target( alias-${alias_suffix} ALL
-      DEPENDS ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-    )
-    set_target_properties( alias-${alias_suffix}
-      PROPERTIES FOLDER "libclc/Device IR/Aliases"
-    )
-    install(
-      FILES ${LIBCLC_OUTPUT_LIBRARY_DIR}/${alias_suffix}
-      DESTINATION "${CMAKE_INSTALL_DATADIR}/clc"
-    )
+      OUTPUT ${alias_suffix}
+      COMMAND ${CMAKE_COMMAND} -E create_symlink ${obj_suffix} ${alias_suffix}
+      DEPENDS prepare-${obj_suffix} )
+    add_custom_target( alias-${alias_suffix} ALL DEPENDS ${alias_suffix} )
+    set_target_properties( alias-${alias_suffix} PROPERTIES FOLDER "libclc/Device IR/Aliases" )
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${alias_suffix}
+             DESTINATION "${CMAKE_INSTALL_DATADIR}/clc" )
   endforeach( a )
 endfunction(add_libclc_builtin_set)
 
diff --git a/libclc/utils/CMakeLists.txt b/libclc/utils/CMakeLists.txt
deleted file mode 100644
index ea1d9e9c8ef5..000000000000
--- a/libclc/utils/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# Construct LLVM version define
-set( LLVM_VERSION_DEFINE "-DHAVE_LLVM=0x${LLVM_VERSION_MAJOR}0${LLVM_VERSION_MINOR}" )
-
-# Setup prepare_builtins tools
-set( LLVM_LINK_COMPONENTS
-  BitReader
-  BitWriter
-  Core
-  IRReader
-  Support
-)
-
-if( LIBCLC_STANDALONE_BUILD )
-  add_llvm_executable( prepare_builtins prepare-builtins.cpp )
-  set( prepare_builtins_exe prepare_builtins )
-  set( prepare_builtins_target prepare_builtins )
-else()
-  add_llvm_utility( prepare_builtins prepare-builtins.cpp )
-  setup_host_tool( prepare_builtins PREPARE_BUILTINS prepare_builtins_exe prepare_builtins_target )
-endif()
-
-target_compile_definitions( prepare_builtins PRIVATE ${LLVM_VERSION_DEFINE} )
-# These were not properly reported in early LLVM and we don't need them
-target_compile_options( prepare_builtins PRIVATE -fno-rtti -fno-exceptions )
diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake
new file mode 100644
index 000000000000..c843c02977a8
--- /dev/null
+++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake
@@ -0,0 +1,2 @@
+set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "")
+set(LIBCXX_TEST_PARAMS "assertion_semantic=observe" CACHE STRING "")
diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst
index 17808841bd9e..1cdb3605c38a 100644
--- a/libcxx/docs/Hardening.rst
+++ b/libcxx/docs/Hardening.rst
@@ -39,6 +39,8 @@ modes are:
 
    Enabling hardening has no impact on the ABI.
 
+.. _notes-for-users:
+
 Notes for users
 ---------------
 
@@ -72,6 +74,10 @@ to control the level by passing **one** of the following options to the compiler
    pre-built components. Most libc++ code is header-based, so a user-provided
    value for ``_LIBCPP_HARDENING_MODE`` will be mostly respected.
 
+In some cases, users might want to override the assertion semantic used by the
+library. This can be done similarly to setting the hardening mode; please refer
+to the :ref:`relevant section <assertion-semantics>`.
+
 Notes for vendors
 -----------------
 
@@ -260,6 +266,68 @@ output. This is less secure and increases the size of the binary (among other
 things, it has to store the error message strings) but makes the failure easier
 to debug. It also allows testing the error messages in our test suite.
 
+This default behavior can be customized by users via :ref:`assertion semantics
+<assertion-semantics>`; it can also be completely overridden by vendors by
+providing a :ref:`custom assertion failure handler
+<override-assertion-handler>`.
+
+.. _assertion-semantics:
+
+Assertion semantics
+-------------------
+
+.. warning::
+
+  Assertion semantics are currently an experimental feature.
+
+.. note::
+
+  Assertion semantics are not available in the C++03 mode.
+
+What happens when an assertion fails depends on the assertion semantic being
+used. Four assertion semantics are available, based on C++26 Contracts
+evaluation semantics:
+
+- ``ignore`` evaluates the assertion but has no effect if it fails (note that it
+  differs from the Contracts ``ignore`` semantic which would not evaluate
+  the assertion at all);
+- ``observe`` logs an error (indicating, if possible on the platform, that the
+  error is fatal) but continues execution;
+- ``quick-enforce`` terminates the program as fast as possible via a trap
+  instruction. It is the default semantic for the production modes (``fast`` and
+  ``extensive``);
+- ``enforce`` logs an error and then terminates the program. It is the default
+  semantic for the ``debug`` mode.
+
+Notes:
+
+- Continuing execution after a hardening check fails results in undefined
+  behavior; the ``observe`` semantic is meant to make adopting hardening easier
+  but should not be used outside of the adoption period;
+- C++26 wording for Library Hardening precludes a conforming Hardened
+  implementation from using the Contracts ``ignore`` semantic when evaluating
+  hardened preconditions in the Library. Libc++ allows using this semantic for
+  hardened preconditions, but please be aware that using ``ignore`` does not
+  produce a conforming "Hardened" implementation, unlike the other semantics
+  above.
+
+The default assertion semantics are as follows:
+
+- ``fast``: ``quick-enforce``;
+- ``extensive``: ``quick-enforce``;
+- ``debug``: ``enforce``.
+
+The default assertion semantics can be overridden by passing **one** of the
+following options to the compiler:
+
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_IGNORE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE``
+- ``-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_ENFORCE``
+
+All the :ref:`same notes <notes-for-users>` apply to setting this macro as for
+setting ``_LIBCPP_HARDENING_MODE``.
+
 .. _override-assertion-handler:
 
 Overriding the assertion failure handler
diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst
index 6f18b61284f4..1410223d56a6 100644
--- a/libcxx/docs/ReleaseNotes/21.rst
+++ b/libcxx/docs/ReleaseNotes/21.rst
@@ -10,7 +10,7 @@ Written by the `Libc++ Team <https://libcxx.llvm.org>`_
 
 .. warning::
 
-   These are in-progress notes for the upcoming libc++ 20.0.0 release.
+   These are in-progress notes for the upcoming libc++ 21.0.0 release.
    Release notes for previous releases can be found on
    `the Download Page <https://releases.llvm.org/download.html>`_.
 
@@ -18,7 +18,7 @@ Introduction
 ============
 
 This document contains the release notes for the libc++ C++ Standard Library,
-part of the LLVM Compiler Infrastructure, release 20.0.0. Here we describe the
+part of the LLVM Compiler Infrastructure, release 21.0.0. Here we describe the
 status of libc++ in some detail, including major improvements from the previous
 release and new feature work. For the general LLVM release notes, see `the LLVM
 documentation <https://llvm.org/docs/ReleaseNotes.html>`_. All LLVM releases may
@@ -88,6 +88,12 @@ Improvements and New Features
 
 - ``ctype::tolower`` and ``ctype::toupper`` have been optimized, resulting in a 2x performance improvement.
 
+- As an experimental feature, Hardening now supports assertion semantics that allow customizing how a hardening
+  assertion failure is handled. The four available semantics, modeled on C++26 Contracts, are ``ignore``, ``observe``,
+  ``quick-enforce`` and ``enforce``. The ``observe`` semantic is intended to make it easier to adopt Hardening in
+  production but should not be used outside of this scenario. Please refer to the :ref:`Hardening documentation
+  <hardening>` for details.
+
 Deprecations and Removals
 -------------------------
 
@@ -140,6 +146,21 @@ ABI Affecting Changes
   comparison between shared libraries, since all RTTI has the correct visibility now. There is no behaviour change on
   Clang.
 
+- LLVM 20 contained an ABI break that can result in the size of ``std::unordered_{map,set,multimap,multiset}`` and ``std::deque`` changing when used with an allocator type that is empty and contains a base class that is the same across rebound allocator instantiations (e.g. ``Allocator<int>`` and ``Allocator<char>`` are both empty and contain the same base class).
+  In addition, the layout of a user-defined type that:
+
+    - contains one of the following containers: ``std::unordered_{map,set,multimap,multiset}``, ``std::deque``, ``std::map``, ``std::set``, ``std::multimap``, ``std::multiset``, ``std::list`` or ``std::vector``, and
+    - passes an empty allocator, comparator or hasher type to that container, and
+    - has a member of that same empty allocator, comparator or hasher type inside the enclosing struct, and
+    - that member is either marked with ``[[no_unique_address]]`` or optimized out via the EBO (empty base optimization) technique
+
+  saw its size increase from LLVM 19 to LLVM 20. This was caused by the usage of ``[[no_unique_address]]`` within some of libc++'s containers in a way that allowed subtle interactions with enclosing objects. This is fixed in LLVM 21 when using the Clang compiler (returning to the LLVM 19 ABI), however that implies an ABI break from LLVM 20 to LLVM 21.
+
+  Furthermore, fixing this causes a slight regression to constant evaluation support in ``std::unique_ptr``. Specifically, constant evaluation will now fail when the deleter relies on being value-initialized for constant-evaluation admissibility. If a default-initialized deleter can be used during constant evaluation, or if the default constructor is non-trivial, the ``unique_ptr`` is not affected by this regression. In particular, this regression does not impact any ``unique_ptr`` using the default deleter.
+
+  Note that there is currently no way to realistically fix this ABI break on GCC, therefore GCC will remain on the ABI introduced in LLVM 19. That also means that Clang and GCC will have a slightly different ABI for the small subset of types listed above until we are able to apply the same fix we did with Clang on GCC.
+
+  For more details see https://llvm.org/PR154146.
 
 Build System Changes
 --------------------
diff --git a/libcxx/docs/UserDocumentation.rst b/libcxx/docs/UserDocumentation.rst
index 79f5908bdc6b..415a59916837 100644
--- a/libcxx/docs/UserDocumentation.rst
+++ b/libcxx/docs/UserDocumentation.rst
@@ -72,6 +72,11 @@ when ``-fexperimental-library`` is passed:
 * ``std::chrono::tzdb`` and related time zone functionality
 * ``<syncstream>``
 
+Additionally, assertion semantics are an experimental feature that can be used
+to customize the behavior of Hardening (see :ref:`here <assertion-semantics>`).
+Assertion semantics mirror the evaluation semantics of C++26 Contracts but are
+not a standard feature.
+
 .. note::
   Experimental libraries are experimental.
     * The contents of the ``<experimental/...>`` headers and the associated static
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 4f2a8dddad92..9e9852de2a4c 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -535,6 +535,7 @@ set(files
   __locale_dir/time.h
   __locale_dir/wbuffer_convert.h
   __locale_dir/wstring_convert.h
+  __log_hardening_failure
   __math/abs.h
   __math/copysign.h
   __math/error_functions.h
@@ -873,6 +874,7 @@ set(files
   __type_traits/is_trivially_relocatable.h
   __type_traits/is_unbounded_array.h
   __type_traits/is_union.h
+  __type_traits/is_unqualified.h
   __type_traits/is_unsigned.h
   __type_traits/is_valid_expansion.h
   __type_traits/is_void.h
diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index 06cb5b8ce705..8aa894e9228c 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -860,6 +860,9 @@ __sort<__less<long double>&, long double*>(long double*, long double*, __less<lo
 template <class _AlgPolicy, class _RandomAccessIterator, class _Comp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void
 __sort_dispatch(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) {
+  if (__first == __last) // log(0) is undefined, so don't try computing the depth
+    return;
+
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   difference_type __depth_limit = 2 * std::__bit_log2(std::__to_unsigned_like(__last - __first));
 
diff --git a/libcxx/include/__assert b/libcxx/include/__assert
index 90eaa6023587..a9451daf47f2 100644
--- a/libcxx/include/__assert
+++ b/libcxx/include/__assert
@@ -20,8 +20,8 @@
 #define _LIBCPP_ASSERT(expression, message)                                                                            \
   (__builtin_expect(static_cast<bool>(expression), 1)                                                                  \
        ? (void)0                                                                                                       \
-       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(__LINE__) ": assertion " _LIBCPP_TOSTRING(            \
-             expression) " failed: " message "\n"))
+       : _LIBCPP_ASSERTION_HANDLER(__FILE__ ":" _LIBCPP_TOSTRING(                                                      \
+             __LINE__) ": libc++ Hardening assertion " _LIBCPP_TOSTRING(expression) " failed: " message "\n"))
 
 // WARNING: __builtin_assume can currently inhibit optimizations. Only add assumptions with a clear
 // optimization intent. See https://discourse.llvm.org/t/llvm-assume-blocks-optimization/71609 for a
diff --git a/libcxx/include/__bit/bit_log2.h b/libcxx/include/__bit/bit_log2.h
index 8077cd91d6fd..9ceeec1b2bc9 100644
--- a/libcxx/include/__bit/bit_log2.h
+++ b/libcxx/include/__bit/bit_log2.h
@@ -9,6 +9,7 @@
 #ifndef _LIBCPP___BIT_BIT_LOG2_H
 #define _LIBCPP___BIT_BIT_LOG2_H
 
+#include <__assert>
 #include <__bit/countl.h>
 #include <__config>
 #include <__type_traits/integer_traits.h>
@@ -23,6 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 template <class _Tp>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __bit_log2(_Tp __t) _NOEXCEPT {
   static_assert(__is_unsigned_integer_v<_Tp>, "__bit_log2 requires an unsigned integer type");
+  _LIBCPP_ASSERT_INTERNAL(__t != 0, "logarithm of 0 is undefined");
   return numeric_limits<_Tp>::digits - 1 - std::__countl_zero(__t);
 }
 
diff --git a/libcxx/include/__config b/libcxx/include/__config
index d940461c3023..4f9d8cd3505b 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -28,7 +28,7 @@
 // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM.
 // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is
 // defined to XXYYZZ.
-#  define _LIBCPP_VERSION 210000
+#  define _LIBCPP_VERSION 210101
 
 #  define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y
 #  define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y)
@@ -38,11 +38,47 @@
 #    define _LIBCPP_FREESTANDING
 #  endif
 
+// NOLINTNEXTLINE(libcpp-cpp-version-check)
+#  if __cplusplus < 201103L
+#    define _LIBCPP_CXX03_LANG
+#  endif
+
+#  if __has_feature(experimental_library)
+#    ifndef _LIBCPP_ENABLE_EXPERIMENTAL
+#      define _LIBCPP_ENABLE_EXPERIMENTAL
+#    endif
+#  endif
+
+// Incomplete features get their own specific disabling flags. This makes it
+// easier to grep for target specific flags once the feature is complete.
+#  if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY)
+#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1
+#  else
+#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0
+#  endif
+
+#  define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#  define _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+
 // HARDENING {
 
-// TODO: Remove in LLVM 21. We're making this an error to catch folks who might not have migrated.
-#  ifdef _LIBCPP_ENABLE_ASSERTIONS
-#    error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE instead"
+// TODO(LLVM 23): Remove this. We're making these an error to catch folks who might not have migrated.
+//       Since hardening went through several changes (many of which impacted user-facing macros),
+//       we're keeping these checks around for a bit longer than usual. Failure to properly configure
+//       hardening results in checks being dropped silently, which is a pretty big deal.
+#  if defined(_LIBCPP_ENABLE_ASSERTIONS)
+#    error "_LIBCPP_ENABLE_ASSERTIONS has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_HARDENED_MODE)
+#    error "_LIBCPP_ENABLE_HARDENED_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_SAFE_MODE)
+#    error "_LIBCPP_ENABLE_SAFE_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
+#  endif
+#  if defined(_LIBCPP_ENABLE_DEBUG_MODE)
+#    error "_LIBCPP_ENABLE_DEBUG_MODE has been removed, please use _LIBCPP_HARDENING_MODE=<mode> instead (see docs)"
 #  endif
 
 // The library provides the macro `_LIBCPP_HARDENING_MODE` which can be set to one of the following values:
@@ -147,16 +183,53 @@ _LIBCPP_HARDENING_MODE_EXTENSIVE, \
 _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
+// Hardening assertion semantics generally mirror the evaluation semantics of C++26 Contracts:
+// - `ignore` evaluates the assertion but doesn't do anything if it fails (note that it differs from the Contracts
+//   `ignore` semantic which wouldn't evaluate the assertion at all);
+// - `observe` logs an error (indicating, if possible, that the error is fatal) and continues execution;
+// - `quick-enforce` terminates the program as fast as possible (via trapping);
+// - `enforce` logs an error and then terminates the program.
+//
+// Notes:
+// - Continuing execution after a hardening check fails results in undefined behavior; the `observe` semantic is meant
+//   to make adopting hardening easier but should not be used outside of this scenario;
+// - C++26 wording for Library Hardening precludes a conforming Hardened implementation from using the Contracts
+//   `ignore` semantic when evaluating hardened preconditions in the Library. Libc++ allows using this semantic for
+//   hardened preconditions, however, be aware that using `ignore` does not produce a conforming "Hardened"
+//   implementation, unlike the other semantics above.
+// clang-format off
+#  define _LIBCPP_ASSERTION_SEMANTIC_IGNORE        (1 << 1)
+#  define _LIBCPP_ASSERTION_SEMANTIC_OBSERVE       (1 << 2)
+#  define _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE (1 << 3)
+#  define _LIBCPP_ASSERTION_SEMANTIC_ENFORCE       (1 << 4)
+// clang-format on
+
+// Allow users to define an arbitrary assertion semantic; otherwise, use the default mapping from modes to semantics.
+// The default is for production-capable modes to use `quick-enforce` (i.e., trap) and for the `debug` mode to use
+// `enforce` (i.e., log and abort).
+#  ifndef _LIBCPP_ASSERTION_SEMANTIC
+
+#    if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#      define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+#    else
+#      define _LIBCPP_ASSERTION_SEMANTIC _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#    endif
+
+#  else
+#    if !_LIBCPP_HAS_EXPERIMENTAL_LIBRARY
+#      error "Assertion semantics are an experimental feature."
+#    endif
+#    if defined(_LIBCPP_CXX03_LANG)
+#      error "Assertion semantics are not available in the C++03 mode."
+#    endif
+
+#  endif // _LIBCPP_ASSERTION_SEMANTIC
+
 // } HARDENING
 
 #  define _LIBCPP_TOSTRING2(x) #x
 #  define _LIBCPP_TOSTRING(x) _LIBCPP_TOSTRING2(x)
 
-// NOLINTNEXTLINE(libcpp-cpp-version-check)
-#  if __cplusplus < 201103L
-#    define _LIBCPP_CXX03_LANG
-#  endif
-
 #  ifndef __has_constexpr_builtin
 #    define __has_constexpr_builtin(x) 0
 #  endif
@@ -190,24 +263,6 @@ _LIBCPP_HARDENING_MODE_DEBUG
 #    define _LIBCPP_ABI_VCRUNTIME
 #  endif
 
-#  if __has_feature(experimental_library)
-#    ifndef _LIBCPP_ENABLE_EXPERIMENTAL
-#      define _LIBCPP_ENABLE_EXPERIMENTAL
-#    endif
-#  endif
-
-// Incomplete features get their own specific disabling flags. This makes it
-// easier to grep for target specific flags once the feature is complete.
-#  if defined(_LIBCPP_ENABLE_EXPERIMENTAL) || defined(_LIBCPP_BUILDING_LIBRARY)
-#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 1
-#  else
-#    define _LIBCPP_HAS_EXPERIMENTAL_LIBRARY 0
-#  endif
-
-#  define _LIBCPP_HAS_EXPERIMENTAL_PSTL _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-#  define _LIBCPP_HAS_EXPERIMENTAL_TZDB _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-#  define _LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM _LIBCPP_HAS_EXPERIMENTAL_LIBRARY
-
 #  if defined(__MVS__)
 #    include <features.h> // for __NATIVE_ASCII_F
 #  endif
diff --git a/libcxx/include/__cxx03/__config b/libcxx/include/__cxx03/__config
index ef47327d9635..9b88a495055e 100644
--- a/libcxx/include/__cxx03/__config
+++ b/libcxx/include/__cxx03/__config
@@ -152,6 +152,10 @@ _LIBCPP_HARDENING_MODE_EXTENSIVE, \
 _LIBCPP_HARDENING_MODE_DEBUG
 #  endif
 
+#  ifdef _LIBCPP_ASSERTION_SEMANTIC
+#    error "Assertion semantics are not available in the C++03 mode."
+#  endif
+
 // } HARDENING
 
 #  define _LIBCPP_TOSTRING2(x) #x
diff --git a/libcxx/include/__functional/hash.h b/libcxx/include/__functional/hash.h
index 489a6f00b8a3..83bbf1b5e26c 100644
--- a/libcxx/include/__functional/hash.h
+++ b/libcxx/include/__functional/hash.h
@@ -21,6 +21,7 @@
 #include <__type_traits/is_enum.h>
 #include <__type_traits/is_floating_point.h>
 #include <__type_traits/is_integral.h>
+#include <__type_traits/is_unqualified.h>
 #include <__type_traits/underlying_type.h>
 #include <__utility/pair.h>
 #include <__utility/swap.h>
@@ -355,7 +356,8 @@ struct __hash_impl {
 };
 
 template <class _Tp>
-struct __hash_impl<_Tp, __enable_if_t<is_enum<_Tp>::value> > : __unary_function<_Tp, size_t> {
+struct __hash_impl<_Tp, __enable_if_t<is_enum<_Tp>::value && __is_unqualified_v<_Tp> > >
+    : __unary_function<_Tp, size_t> {
   _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT {
     using type = __underlying_type_t<_Tp>;
     return hash<type>()(static_cast<type>(__v));
@@ -363,17 +365,21 @@ struct __hash_impl<_Tp, __enable_if_t<is_enum<_Tp>::value> > : __unary_function<
 };
 
 template <class _Tp>
-struct __hash_impl<_Tp, __enable_if_t<is_integral<_Tp>::value && (sizeof(_Tp) <= sizeof(size_t))> >
+struct __hash_impl<
+    _Tp,
+    __enable_if_t<is_integral<_Tp>::value && __is_unqualified_v<_Tp> && (sizeof(_Tp) <= sizeof(size_t))> >
     : __unary_function<_Tp, size_t> {
   _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT { return static_cast<size_t>(__v); }
 };
 
 template <class _Tp>
-struct __hash_impl<_Tp, __enable_if_t<is_integral<_Tp>::value && (sizeof(_Tp) > sizeof(size_t))> >
+struct __hash_impl<_Tp,
+                   __enable_if_t<is_integral<_Tp>::value && __is_unqualified_v<_Tp> && (sizeof(_Tp) > sizeof(size_t))> >
     : __scalar_hash<_Tp> {};
 
 template <class _Tp>
-struct __hash_impl<_Tp, __enable_if_t<is_floating_point<_Tp>::value> > : __scalar_hash<_Tp> {
+struct __hash_impl<_Tp, __enable_if_t<is_floating_point<_Tp>::value && __is_unqualified_v<_Tp> > >
+    : __scalar_hash<_Tp> {
   _LIBCPP_HIDE_FROM_ABI size_t operator()(_Tp __v) const _NOEXCEPT {
     // -0.0 and 0.0 should return same hash
     if (__v == 0.0f)
diff --git a/libcxx/include/__log_hardening_failure b/libcxx/include/__log_hardening_failure
new file mode 100644
index 000000000000..d1805306f6b6
--- /dev/null
+++ b/libcxx/include/__log_hardening_failure
@@ -0,0 +1,42 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___LOG_HARDENING_FAILURE
+#define _LIBCPP___LOG_HARDENING_FAILURE
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+// Hardening logging is not available in the C++03 mode; moreover, it is currently only available in the experimental
+// library.
+#if _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC && !defined(_LIBCPP_CXX03_LANG)
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// This function should never be called directly from the code -- it should only be called through the
+// `_LIBCPP_LOG_HARDENING_FAILURE` macro.
+[[__gnu__::__cold__]] _LIBCPP_EXPORTED_FROM_ABI void __log_hardening_failure(const char* __message) noexcept;
+
+// _LIBCPP_LOG_HARDENING_FAILURE(message)
+//
+// This macro is used to log an error without terminating the program (as is the case for hardening failures if the
+// `observe` assertion semantic is used).
+
+#  if !defined(_LIBCPP_LOG_HARDENING_FAILURE)
+#    define _LIBCPP_LOG_HARDENING_FAILURE(__message) ::std::__log_hardening_failure(__message)
+#  endif // !defined(_LIBCPP_LOG_HARDENING_FAILURE)
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC && !defined(_LIBCPP_CXX03_LANG)
+
+#endif // _LIBCPP___LOG_HARDENING_FAILURE
diff --git a/libcxx/include/__memory/compressed_pair.h b/libcxx/include/__memory/compressed_pair.h
index fb7b7b7afcc8..29e503931b0b 100644
--- a/libcxx/include/__memory/compressed_pair.h
+++ b/libcxx/include/__memory/compressed_pair.h
@@ -80,21 +80,45 @@ class __compressed_pair_padding {
 template <class _ToPad>
 class __compressed_pair_padding<_ToPad, true> {};
 
-#  define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                  \
-    _LIBCPP_NO_UNIQUE_ADDRESS __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;    \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _)
-
-#  define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                              \
-    _LIBCPP_NO_UNIQUE_ADDRESS                                                                                          \
-    __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                                \
-                   __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                              \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);          \
-    _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                         \
-    _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _)
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef _LIBCPP_COMPILER_GCC
+#    define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                \
+      _LIBCPP_NO_UNIQUE_ADDRESS __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;  \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _)
+
+#    define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                            \
+      _LIBCPP_NO_UNIQUE_ADDRESS                                                                                        \
+      __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                              \
+                     __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                            \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);        \
+      _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                       \
+      _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _)
+#  else
+#    define _LIBCPP_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)                                                \
+      struct {                                                                                                         \
+        _LIBCPP_NO_UNIQUE_ADDRESS                                                                                      \
+        __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>))) T1 Initializer1;                          \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);      \
+      }
+
+#    define _LIBCPP_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3, Initializer3)                            \
+      struct {                                                                                                         \
+        _LIBCPP_NO_UNIQUE_ADDRESS                                                                                      \
+        __attribute__((__aligned__(::std::__compressed_pair_alignment<T2>),                                            \
+                       __aligned__(::std::__compressed_pair_alignment<T3>))) T1 Initializer1;                          \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T1> _LIBCPP_CONCAT3(__padding1_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T2 Initializer2;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T2> _LIBCPP_CONCAT3(__padding2_, __LINE__, _);      \
+        _LIBCPP_NO_UNIQUE_ADDRESS T3 Initializer3;                                                                     \
+        _LIBCPP_NO_UNIQUE_ADDRESS ::std::__compressed_pair_padding<T3> _LIBCPP_CONCAT3(__padding3_, __LINE__, _);      \
+      }
+#  endif
 
 #else
 #  define _LIBCPP_COMPRESSED_PAIR(T1, Name1, T2, Name2)                                                                \
diff --git a/libcxx/include/__type_traits/invoke.h b/libcxx/include/__type_traits/invoke.h
index 5ff2efbe5faa..3f5626c01443 100644
--- a/libcxx/include/__type_traits/invoke.h
+++ b/libcxx/include/__type_traits/invoke.h
@@ -67,20 +67,20 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if __has_builtin(__builtin_invoke)
 
-template <class... _Args>
-using __invoke_result_t _LIBCPP_NODEBUG = decltype(__builtin_invoke(std::declval<_Args>()...));
-
 template <class, class... _Args>
 struct __invoke_result_impl {};
 
 template <class... _Args>
-struct __invoke_result_impl<__void_t<__invoke_result_t<_Args...> >, _Args...> {
-  using type _LIBCPP_NODEBUG = __invoke_result_t<_Args...>;
+struct __invoke_result_impl<__void_t<decltype(__builtin_invoke(std::declval<_Args>()...))>, _Args...> {
+  using type _LIBCPP_NODEBUG = decltype(__builtin_invoke(std::declval<_Args>()...));
 };
 
 template <class... _Args>
 using __invoke_result _LIBCPP_NODEBUG = __invoke_result_impl<void, _Args...>;
 
+template <class... _Args>
+using __invoke_result_t _LIBCPP_NODEBUG = typename __invoke_result<_Args...>::type;
+
 template <class... _Args>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR __invoke_result_t<_Args...> __invoke(_Args&&... __args)
     _NOEXCEPT_(noexcept(__builtin_invoke(std::forward<_Args>(__args)...))) {
diff --git a/libcxx/include/__type_traits/is_unqualified.h b/libcxx/include/__type_traits/is_unqualified.h
new file mode 100644
index 000000000000..7970b3611601
--- /dev/null
+++ b/libcxx/include/__type_traits/is_unqualified.h
@@ -0,0 +1,25 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___TYPE_TRAITS_IS_UNQUALIFIED_H
+#define _LIBCPP___TYPE_TRAITS_IS_UNQUALIFIED_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _Tp>
+inline const bool __is_unqualified_v = __is_same(_Tp, __remove_cvref(_Tp));
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP___TYPE_TRAITS_IS_UNQUALIFIED_H
diff --git a/libcxx/include/ext/hash_map b/libcxx/include/ext/hash_map
index d6b92204f437..46815eaffa8b 100644
--- a/libcxx/include/ext/hash_map
+++ b/libcxx/include/ext/hash_map
@@ -744,7 +744,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); }
   _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); }
+  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); }
   _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); }
   template <class _InputIterator>
   _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last);
@@ -831,7 +831,7 @@ template <class _Key, class _Tp, class _Hash, class _Pred, class _Alloc>
 template <class _InputIterator>
 inline void hash_multimap<_Key, _Tp, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) {
   for (; __first != __last; ++__first)
-    __table_.__emplace_unique(*__first);
+    __table_.__emplace_multi(*__first);
 }
 
 template <class _Key, class _Tp, class _Hash, class _Pred, class _Alloc>
diff --git a/libcxx/include/ext/hash_set b/libcxx/include/ext/hash_set
index 7fd5df24ed3a..62a7a0dbcffb 100644
--- a/libcxx/include/ext/hash_set
+++ b/libcxx/include/ext/hash_set
@@ -458,7 +458,7 @@ public:
   _LIBCPP_HIDE_FROM_ABI const_iterator begin() const { return __table_.begin(); }
   _LIBCPP_HIDE_FROM_ABI const_iterator end() const { return __table_.end(); }
 
-  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_unique(__x); }
+  _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return __table_.__emplace_multi(__x); }
   _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator, const value_type& __x) { return insert(__x); }
   template <class _InputIterator>
   _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last);
@@ -543,7 +543,7 @@ template <class _Value, class _Hash, class _Pred, class _Alloc>
 template <class _InputIterator>
 inline void hash_multiset<_Value, _Hash, _Pred, _Alloc>::insert(_InputIterator __first, _InputIterator __last) {
   for (; __first != __last; ++__first)
-    __table_.__emplace_unique(*__first);
+    __table_.__emplace_multi(*__first);
 }
 
 template <class _Value, class _Hash, class _Pred, class _Alloc>
diff --git a/libcxx/include/fstream b/libcxx/include/fstream
index a62c085db23d..2f4455e1bfa4 100644
--- a/libcxx/include/fstream
+++ b/libcxx/include/fstream
@@ -402,6 +402,14 @@ private:
       }
     }
   }
+
+  _LIBCPP_HIDE_FROM_ABI typename traits_type::int_type __overflow_failed() {
+    if (this->pptr() == this->epptr() + 1) {
+      this->pbump(-1); // lose the character we overflowed above -- we don't really have a
+                       // choice since we couldn't commit the contents of the put area
+    }
+    return traits_type::eof();
+  }
 };
 
 template <class _CharT, class _Traits>
@@ -842,8 +850,9 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
 
   if (__always_noconv_) {
     size_t __n = static_cast<size_t>(this->pptr() - this->pbase());
-    if (std::fwrite(this->pbase(), sizeof(char_type), __n, __file_) != __n)
-      return traits_type::eof();
+    if (std::fwrite(this->pbase(), sizeof(char_type), __n, __file_) != __n) {
+      return __overflow_failed();
+    }
   } else {
     if (!__cv_)
       std::__throw_bad_cast();
@@ -855,34 +864,38 @@ typename basic_filebuf<_CharT, _Traits>::int_type basic_filebuf<_CharT, _Traits>
     char* __extbuf_end = __extbuf_;
     do {
       codecvt_base::result __r = __cv_->out(__st_, __b, __p, __end, __extbuf_, __extbuf_ + __ebs_, __extbuf_end);
-      if (__end == __b)
-        return traits_type::eof();
+      if (__end == __b) {
+        return __overflow_failed();
+      }
 
       // No conversion needed: output characters directly to the file, done.
       if (__r == codecvt_base::noconv) {
         size_t __n = static_cast<size_t>(__p - __b);
-        if (std::fwrite(__b, 1, __n, __file_) != __n)
-          return traits_type::eof();
+        if (std::fwrite(__b, 1, __n, __file_) != __n) {
+          return __overflow_failed();
+        }
         break;
 
         // Conversion successful: output the converted characters to the file, done.
       } else if (__r == codecvt_base::ok) {
         size_t __n = static_cast<size_t>(__extbuf_end - __extbuf_);
-        if (std::fwrite(__extbuf_, 1, __n, __file_) != __n)
-          return traits_type::eof();
+        if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) {
+          return __overflow_failed();
+        }
         break;
 
         // Conversion partially successful: output converted characters to the file and repeat with the
         // remaining characters.
       } else if (__r == codecvt_base::partial) {
         size_t __n = static_cast<size_t>(__extbuf_end - __extbuf_);
-        if (std::fwrite(__extbuf_, 1, __n, __file_) != __n)
-          return traits_type::eof();
+        if (std::fwrite(__extbuf_, 1, __n, __file_) != __n) {
+          return __overflow_failed();
+        }
         __b = const_cast<char_type*>(__end);
         continue;
 
       } else {
-        return traits_type::eof();
+        return __overflow_failed();
       }
     } while (true);
   }
diff --git a/libcxx/include/map b/libcxx/include/map
index 225156580147..3d88b32dd426 100644
--- a/libcxx/include/map
+++ b/libcxx/include/map
@@ -692,12 +692,12 @@ public:
 #  if _LIBCPP_STD_VER >= 14
   template <typename _K2>
   _LIBCPP_HIDE_FROM_ABI bool operator()(const _K2& __x, const _CP& __y) const {
-    return __comp_(__x, __y.__get_value().first);
+    return __comp_(__x, __y.first);
   }
 
   template <typename _K2>
   _LIBCPP_HIDE_FROM_ABI bool operator()(const _CP& __x, const _K2& __y) const {
-    return __comp_(__x.__get_value().first, __y);
+    return __comp_(__x.first, __y);
   }
 #  endif
 };
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 61ba1c381b2b..4509dc20c6e6 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -336,6 +336,7 @@ module std_core [system] {
       header "__type_traits/is_union.h"
       export std_core.type_traits.integral_constant
     }
+    module is_unqualified { header "__type_traits/is_unqualified.h" }
     module is_unsigned {
       header "__type_traits/is_unsigned.h"
       export std_core.type_traits.integral_constant
@@ -2353,6 +2354,9 @@ module std [system] {
     header "__std_mbstate_t.h"
     export *
   }
+  module log_hardening_failure {
+    header "__log_hardening_failure"
+  }
   module verbose_abort {
     header "__verbose_abort"
   }
diff --git a/libcxx/include/string b/libcxx/include/string
index 514dd91c7c17..d282071abf11 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -974,7 +974,12 @@ public:
 
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string()
       _NOEXCEPT_(is_nothrow_default_constructible<allocator_type>::value)
-      : __rep_() {
+#  if _LIBCPP_STD_VER >= 20 // TODO(LLVM 23): Remove this condition; this is a workaround for https://llvm.org/PR154567
+      : __rep_(__short())
+#  else
+      : __rep_()
+#  endif
+  {
     __annotate_new(0);
   }
 
@@ -984,7 +989,12 @@ public:
 #  else
       _NOEXCEPT
 #  endif
-      : __rep_(), __alloc_(__a) {
+#  if _LIBCPP_STD_VER >= 20 // TODO(LLVM 23): Remove this condition; this is a workaround for https://llvm.org/PR154567
+      : __rep_(__short()),
+#  else
+      : __rep_(),
+#  endif
+        __alloc_(__a) {
     __annotate_new(0);
   }
 
diff --git a/libcxx/lib/abi/CMakeLists.txt b/libcxx/lib/abi/CMakeLists.txt
index 7c08bd06c50b..8f277aad2dcd 100644
--- a/libcxx/lib/abi/CMakeLists.txt
+++ b/libcxx/lib/abi/CMakeLists.txt
@@ -16,6 +16,9 @@ function(cxx_abi_list_identifier result triple abi_library abi_version unstable
   elseif("${triple}" MATCHES "freebsd")
     # Ignore the major and minor versions of freebsd targets.
     string(REGEX REPLACE "freebsd[0-9]+\\.[0-9]+" "freebsd" triple "${triple}")
+  elseif("${triple}" MATCHES "aix")
+    # Ignore the V.R.M.F version string of aix targets.
+    string(REGEX REPLACE "aix[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+" "aix" triple "${triple}")
   endif()
   list(APPEND abi_properties "${triple}")
   list(APPEND abi_properties "${abi_library}")
diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 97fe57a5f24f..f59fe0e08fcc 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -309,6 +309,7 @@ add_custom_target(cxx DEPENDS ${LIBCXX_BUILD_TARGETS})
 # Build the experimental static library
 set(LIBCXX_EXPERIMENTAL_SOURCES
   experimental/keep.cpp
+  experimental/log_hardening_failure.cpp
   )
 
 if (LIBCXX_PSTL_BACKEND STREQUAL "libdispatch")
diff --git a/libcxx/src/algorithm.cpp b/libcxx/src/algorithm.cpp
index d388fee5f99c..8157be6f7406 100644
--- a/libcxx/src/algorithm.cpp
+++ b/libcxx/src/algorithm.cpp
@@ -13,6 +13,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD
 
 template <class Comp, class RandomAccessIterator>
 void __sort(RandomAccessIterator first, RandomAccessIterator last, Comp comp) {
+  if (first == last) // log(0) is undefined, so don't try computing the depth
+    return;
+
   auto depth_limit = 2 * std::__bit_log2(static_cast<size_t>(last - first));
 
   // Only use bitset partitioning for arithmetic types.  We should also check
diff --git a/libcxx/src/experimental/log_hardening_failure.cpp b/libcxx/src/experimental/log_hardening_failure.cpp
new file mode 100644
index 000000000000..f836c1545224
--- /dev/null
+++ b/libcxx/src/experimental/log_hardening_failure.cpp
@@ -0,0 +1,31 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <__config>
+#include <__log_hardening_failure>
+#include <cstdio>
+
+#ifdef __BIONIC__
+#  include <syslog.h>
+#endif // __BIONIC__
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+void __log_hardening_failure(const char* message) noexcept {
+  // Always log the message to `stderr` in case the platform-specific system calls fail.
+  std::fputs(message, stderr);
+
+#if defined(__BIONIC__)
+  // Show error in logcat. The latter two arguments are ignored on Android.
+  openlog("libc++", 0, 0);
+  syslog(LOG_CRIT, "%s", message);
+  closelog();
+#endif
+}
+
+_LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
new file mode 100644
index 000000000000..ea80359f1fea
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multimap/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multimap::insert
+
+#include <cassert>
+#include <ext/hash_map>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multimap<int, int> map;
+
+  map.insert(std::make_pair(1, 1));
+  map.insert(std::make_pair(1, 1));
+
+  assert(map.size() == 2);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  std::pair<int, int> arr[] = {std::make_pair(1, 1), std::make_pair(1, 1)};
+
+  map.insert(arr, arr + 2);
+
+  assert(map.size() == 4);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  return 0;
+}
diff --git a/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
new file mode 100644
index 000000000000..1a60cac158a4
--- /dev/null
+++ b/libcxx/test/extensions/gnu/hash_multiset/insert.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ADDITIONAL_COMPILE_FLAGS: -Wno-deprecated
+
+// hash_multimap::insert
+
+#include <cassert>
+#include <ext/hash_set>
+
+int main(int, char**) {
+  __gnu_cxx::hash_multiset<int> map;
+
+  map.insert(1);
+  map.insert(1);
+
+  assert(map.size() == 2);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  int arr[] = {1, 1};
+
+  map.insert(arr, arr + 2);
+
+  assert(map.size() == 4);
+  assert(map.equal_range(1).first == map.begin());
+  assert(map.equal_range(1).second == map.end());
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp b/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp
new file mode 100644
index 000000000000..dda071b8d029
--- /dev/null
+++ b/libcxx/test/libcxx/assertions/log_hardening_failure.pass.cpp
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Basic smoke test for `__log_hardening_failure`.
+//
+// UNSUPPORTED: c++03
+// UNSUPPORTED: libcpp-has-no-experimental-hardening-observe-semantic
+
+#include <__log_hardening_failure>
+
+#include "test_macros.h"
+
+ASSERT_NOEXCEPT(std::__log_hardening_failure(""));
+
+int main(int, char**) {
+  std::__log_hardening_failure("Some message");
+  // It's difficult to properly test platform-specific logging behavior of the function; just make sure it exists and
+  // can be called at runtime.
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp
new file mode 100644
index 000000000000..e0598b4ff174
--- /dev/null
+++ b/libcxx/test/libcxx/containers/associative/map/abi.compile.pass.cpp
@@ -0,0 +1,168 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
+#include <cstdint>
+#include <map>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+template <class T>
+class final_small_iter_allocator final {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  final_small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  final_small_iter_allocator(final_small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; }
+  friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class T, class Alloc>
+using map_alloc = std::map<T, T, std::less<T>, Alloc>;
+
+struct user_struct {
+  map_alloc<int, common_base_allocator<std::pair<const int, int> > > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 40, "");
+static_assert(sizeof(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 6, "");
+static_assert(sizeof(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 8, "");
+
+static_assert(sizeof(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 40, "");
+static_assert(sizeof(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 6, "");
+static_assert(sizeof(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 8, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 8, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::map<int, int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::map<int, int, AlignedLess>) == 32, "");
+
+#elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 12, "");
+static_assert(sizeof(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 12, "");
+static_assert(sizeof(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 24, "");
+static_assert(sizeof(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 6, "");
+static_assert(sizeof(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 8, "");
+
+static_assert(sizeof(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 12, "");
+static_assert(sizeof(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 12, "");
+static_assert(sizeof(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 24, "");
+static_assert(sizeof(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 6, "");
+static_assert(sizeof(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 8, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<int, std::allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, min_allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, test_allocator<std::pair<const int, int> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<int, final_small_iter_allocator<std::pair<const int, int> > >) == 2, "");
+
+static_assert(TEST_ALIGNOF(map_alloc<char, std::allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, min_allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, test_allocator<std::pair<const char, char> > >) == 4, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+static_assert(TEST_ALIGNOF(map_alloc<char, final_small_iter_allocator<std::pair<const char, char> > >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::map<int, int, AlignedLess>) == 64);
+static_assert(TEST_ALIGNOF(std::map<int, int, AlignedLess>) == 32);
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp
new file mode 100644
index 000000000000..14f89b6c9dea
--- /dev/null
+++ b/libcxx/test/libcxx/containers/associative/set/abi.compile.pass.cpp
@@ -0,0 +1,168 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
+#include <cstdint>
+#include <set>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+template <class T>
+class final_small_iter_allocator final {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::uint16_t;
+  using difference_type = std::int16_t;
+
+  final_small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  final_small_iter_allocator(final_small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(final_small_iter_allocator, final_small_iter_allocator) { return true; }
+  friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class T, class Alloc>
+using set_alloc = std::set<T, std::less<T>, Alloc>;
+
+struct user_struct {
+  set_alloc<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(set_alloc<int, std::allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, min_allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, test_allocator<int> >) == 40, "");
+static_assert(sizeof(set_alloc<int, small_iter_allocator<int> >) == 6, "");
+static_assert(sizeof(set_alloc<int, final_small_iter_allocator<int> >) == 8, "");
+
+static_assert(sizeof(set_alloc<char, std::allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, min_allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, test_allocator<char> >) == 40, "");
+static_assert(sizeof(set_alloc<char, small_iter_allocator<char> >) == 6, "");
+static_assert(sizeof(set_alloc<char, final_small_iter_allocator<char> >) == 8, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<int, std::allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, min_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, test_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, final_small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<char, std::allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, final_small_iter_allocator<char> >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::set<int, AlignedLess>) == 64, "");
+static_assert(TEST_ALIGNOF(std::set<int, AlignedLess>) == 32, "");
+
+#elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(set_alloc<int, std::allocator<int> >) == 12, "");
+static_assert(sizeof(set_alloc<int, min_allocator<int> >) == 12, "");
+static_assert(sizeof(set_alloc<int, test_allocator<int> >) == 24, "");
+static_assert(sizeof(set_alloc<int, small_iter_allocator<int> >) == 6, "");
+static_assert(sizeof(set_alloc<int, final_small_iter_allocator<int> >) == 8, "");
+
+static_assert(sizeof(set_alloc<char, std::allocator<char> >) == 12, "");
+static_assert(sizeof(set_alloc<char, min_allocator<char> >) == 12, "");
+static_assert(sizeof(set_alloc<char, test_allocator<char> >) == 24, "");
+static_assert(sizeof(set_alloc<char, small_iter_allocator<char> >) == 6, "");
+static_assert(sizeof(set_alloc<char, final_small_iter_allocator<char> >) == 8, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<int, std::allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, min_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, test_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<int, final_small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(set_alloc<char, std::allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(set_alloc<char, final_small_iter_allocator<char> >) == 2, "");
+
+struct TEST_ALIGNAS(32) AlignedLess {};
+
+static_assert(sizeof(std::set<int, AlignedLess>) == 64);
+static_assert(TEST_ALIGNOF(std::set<int, AlignedLess>) == 32);
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
index 55d42a8d017e..60c3e5bf31b0 100644
--- a/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/associative/unord.map/abi.compile.pass.cpp
@@ -12,8 +12,6 @@
 // unordered containers changes when bounded unique_ptr is enabled.
 // UNSUPPORTED: libcpp-has-abi-bounded-unique_ptr
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <cstdint>
 #include <unordered_map>
 
@@ -66,10 +64,41 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 template <class T, class Alloc>
 using unordered_map_alloc = std::unordered_map<T, T, std::hash<T>, std::equal_to<T>, Alloc>;
 
+struct user_struct {
+  unordered_map_alloc<int, common_base_allocator<std::pair<const int, int> > > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 48, "");
+#  else
+static_assert(sizeof(user_struct) == 40, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(unordered_map_alloc<int, std::allocator<std::pair<const int, int> > >) == 40, "");
 static_assert(sizeof(unordered_map_alloc<int, min_allocator<std::pair<const int, int> > >) == 40, "");
@@ -98,12 +127,22 @@ static_assert(TEST_ALIGNOF(unordered_map_alloc<char, final_small_iter_allocator<
 
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
-
-// This part of the ABI has been broken between LLVM 19 and LLVM 20.
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 64, "");
+#  else
+static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 96, "");
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 32, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 24, "");
+#  else
+static_assert(sizeof(user_struct) == 20, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(unordered_map_alloc<int, std::allocator<std::pair<const int, int> > >) == 20, "");
 static_assert(sizeof(unordered_map_alloc<int, min_allocator<std::pair<const int, int> > >) == 20, "");
@@ -133,7 +172,12 @@ static_assert(TEST_ALIGNOF(unordered_map_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 64);
+#  else
+static_assert(sizeof(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 96);
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_map<int, int, AlignedHash, UnalignedEqualTo>) == 32);
 
 #else
diff --git a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
index bee2012bbea2..0216cb1213ed 100644
--- a/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/associative/unord.set/abi.compile.pass.cpp
@@ -12,8 +12,6 @@
 // unordered containers changes when bounded unique_ptr is enabled.
 // UNSUPPORTED: libcpp-has-abi-bounded-unique_ptr
 
-// XFAIL: FROZEN-CXX03-HEADERS-FIXME
-
 #include <cstdint>
 #include <unordered_set>
 
@@ -66,10 +64,41 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 template <class T, class Alloc>
 using unordered_set_alloc = std::unordered_set<T, std::hash<T>, std::equal_to<T>, Alloc>;
 
+struct user_struct {
+  unordered_set_alloc<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 48, "");
+#  else
+static_assert(sizeof(user_struct) == 40, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(unordered_set_alloc<int, std::allocator<int> >) == 40, "");
 static_assert(sizeof(unordered_set_alloc<int, min_allocator<int> >) == 40, "");
@@ -98,11 +127,22 @@ static_assert(TEST_ALIGNOF(unordered_set_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
-// This part of the ABI has been broken between LLVM 19 and LLVM 20.
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 64, "");
+#  else
+static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 96, "");
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 32, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 24, "");
+#  else
+static_assert(sizeof(user_struct) == 20, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(unordered_set_alloc<int, std::allocator<int> >) == 20, "");
 static_assert(sizeof(unordered_set_alloc<int, min_allocator<int> >) == 20, "");
@@ -131,7 +171,12 @@ static_assert(TEST_ALIGNOF(unordered_set_alloc<char, final_small_iter_allocator<
 struct TEST_ALIGNAS(32) AlignedHash {};
 struct UnalignedEqualTo {};
 
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
 static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 64);
+#  else
+static_assert(sizeof(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 96);
+#  endif
 static_assert(TEST_ALIGNOF(std::unordered_set<int, AlignedHash, UnalignedEqualTo>) == 32);
 
 #else
diff --git a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
index 30586d8b2422..7eaf64ea09d1 100644
--- a/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/deque/abi.compile.pass.cpp
@@ -60,6 +60,25 @@ class final_small_iter_allocator final {
   friend bool operator!=(final_small_iter_allocator, final_small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
 #if __SIZE_WIDTH__ == 64
 
 static_assert(sizeof(std::deque<int>) == 48, "");
@@ -67,24 +86,38 @@ static_assert(sizeof(std::deque<int, min_allocator<int> >) == 48, "");
 static_assert(sizeof(std::deque<int, test_allocator<int> >) == 80, "");
 static_assert(sizeof(std::deque<int, small_iter_allocator<int> >) == 12, "");
 static_assert(sizeof(std::deque<int, final_small_iter_allocator<int> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 56, "");
+#  else
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 48, "");
+#  endif
 
 static_assert(sizeof(std::deque<char>) == 48, "");
 static_assert(sizeof(std::deque<char, min_allocator<char> >) == 48, "");
 static_assert(sizeof(std::deque<char, test_allocator<char> >) == 80, "");
 static_assert(sizeof(std::deque<char, small_iter_allocator<char> >) == 12, "");
 static_assert(sizeof(std::deque<char, final_small_iter_allocator<char> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 56, "");
+#  else
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 48, "");
+#  endif
 
 static_assert(TEST_ALIGNOF(std::deque<int>) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, min_allocator<int> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, test_allocator<int> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<int, small_iter_allocator<int> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<int, final_small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<int, common_base_allocator<int> >) == 8, "");
 
 static_assert(TEST_ALIGNOF(std::deque<char>) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, min_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::deque<char, small_iter_allocator<char> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<char, final_small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<char, common_base_allocator<char> >) == 8, "");
 
 #elif __SIZE_WIDTH__ == 32
 
@@ -93,24 +126,38 @@ static_assert(sizeof(std::deque<int, min_allocator<int> >) == 24, "");
 static_assert(sizeof(std::deque<int, test_allocator<int> >) == 48, "");
 static_assert(sizeof(std::deque<int, small_iter_allocator<int> >) == 12, "");
 static_assert(sizeof(std::deque<int, final_small_iter_allocator<int> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 28, "");
+#  else
+static_assert(sizeof(std::deque<int, common_base_allocator<int> >) == 24, "");
+#  endif
 
 static_assert(sizeof(std::deque<char>) == 24, "");
 static_assert(sizeof(std::deque<char, min_allocator<char> >) == 24, "");
 static_assert(sizeof(std::deque<char, test_allocator<char> >) == 48, "");
 static_assert(sizeof(std::deque<char, small_iter_allocator<char> >) == 12, "");
 static_assert(sizeof(std::deque<char, final_small_iter_allocator<char> >) == 16, "");
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 28, "");
+#  else
+static_assert(sizeof(std::deque<char, common_base_allocator<char> >) == 24, "");
+#  endif
 
 static_assert(TEST_ALIGNOF(std::deque<int>) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, min_allocator<int> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, test_allocator<int> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<int, small_iter_allocator<int> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<int, final_small_iter_allocator<int> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<int, common_base_allocator<int> >) == 4, "");
 
 static_assert(TEST_ALIGNOF(std::deque<char>) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, min_allocator<char> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, test_allocator<char> >) == 4, "");
 static_assert(TEST_ALIGNOF(std::deque<char, small_iter_allocator<char> >) == 2, "");
 static_assert(TEST_ALIGNOF(std::deque<char, final_small_iter_allocator<char> >) == 2, "");
+static_assert(TEST_ALIGNOF(std::deque<char, common_base_allocator<char> >) == 4, "");
 
 #else
 #  error std::size_t has an unexpected size
diff --git a/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp
new file mode 100644
index 000000000000..cb500f9c4d61
--- /dev/null
+++ b/libcxx/test/libcxx/containers/sequences/forwardlist/abi.compile.pass.cpp
@@ -0,0 +1,115 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <forward_list>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::int16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::forward_list<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+static_assert(sizeof(user_struct) == 16, "");
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(std::forward_list<int>) == 8, "");
+static_assert(sizeof(std::forward_list<int, min_allocator<int> >) == 8, "");
+static_assert(sizeof(std::forward_list<int, test_allocator<int> >) == 24, "");
+static_assert(sizeof(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(sizeof(std::forward_list<char>) == 8, "");
+static_assert(sizeof(std::forward_list<char, min_allocator<char> >) == 8, "");
+static_assert(sizeof(std::forward_list<char, test_allocator<char> >) == 24, "");
+static_assert(sizeof(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<int>) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, min_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, test_allocator<int> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<char>) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+#elif __SIZE_WIDTH__ == 32
+static_assert(sizeof(user_struct) == 8, "");
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(std::forward_list<int>) == 4, "");
+static_assert(sizeof(std::forward_list<int, min_allocator<int> >) == 4, "");
+static_assert(sizeof(std::forward_list<int, test_allocator<int> >) == 16, "");
+static_assert(sizeof(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(sizeof(std::forward_list<char>) == 4, "");
+static_assert(sizeof(std::forward_list<char, min_allocator<char> >) == 4, "");
+static_assert(sizeof(std::forward_list<char, test_allocator<char> >) == 16, "");
+static_assert(sizeof(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<int>) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, min_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, test_allocator<int> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<int, small_iter_allocator<int> >) == 2, "");
+
+static_assert(TEST_ALIGNOF(std::forward_list<char>) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(std::forward_list<char, small_iter_allocator<char> >) == 2, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
index a16ae1d52792..4b9c295c46f2 100644
--- a/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/list/abi.compile.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
 #include <cstdint>
 #include <list>
 
@@ -38,7 +40,38 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::list<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::list<int>) == 24, "");
 static_assert(sizeof(std::list<int, min_allocator<int> >) == 24, "");
@@ -61,6 +94,13 @@ static_assert(TEST_ALIGNOF(std::list<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::list<char, small_iter_allocator<char> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::list<int>) == 12, "");
 static_assert(sizeof(std::list<int, min_allocator<int> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
index cc6b0d94e7da..598d54ffb91a 100644
--- a/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector.bool/abi.compile.pass.cpp
@@ -40,7 +40,37 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::vector<bool, common_base_allocator<bool> > v;
+  [[no_unique_address]] common_base_allocator<bool> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::vector<bool>) == 24, "");
 static_assert(sizeof(std::vector<bool, min_allocator<bool> >) == 24, "");
@@ -53,6 +83,13 @@ static_assert(TEST_ALIGNOF(std::vector<bool, test_allocator<bool> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::vector<bool, small_iter_allocator<bool> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::vector<bool>) == 12, "");
 static_assert(sizeof(std::vector<bool, min_allocator<bool> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
index 57684951c8e8..6dc0504991b3 100644
--- a/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
+++ b/libcxx/test/libcxx/containers/sequences/vector/abi.compile.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// UNSUPPORTED: libcpp-abi-no-compressed-pair-padding
+
 #include <cstdint>
 #include <vector>
 
@@ -46,7 +48,38 @@ class small_iter_allocator {
   friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
 };
 
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+struct user_struct {
+  std::vector<int, common_base_allocator<int> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
 #if __SIZE_WIDTH__ == 64
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 32, "");
+#  else
+static_assert(sizeof(user_struct) == 24, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
 
 static_assert(sizeof(std::vector<int>) == 24, "");
 static_assert(sizeof(std::vector<int, min_allocator<int> >) == 24, "");
@@ -69,6 +102,13 @@ static_assert(TEST_ALIGNOF(std::vector<char, test_allocator<char> >) == 8, "");
 static_assert(TEST_ALIGNOF(std::vector<char, small_iter_allocator<char> >) == 2, "");
 
 #elif __SIZE_WIDTH__ == 32
+// TODO: Fix the ABI for GCC as well once https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121637 is fixed
+#  ifdef TEST_COMPILER_GCC
+static_assert(sizeof(user_struct) == 16, "");
+#  else
+static_assert(sizeof(user_struct) == 12, "");
+#  endif
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
 
 static_assert(sizeof(std::vector<int>) == 12, "");
 static_assert(sizeof(std::vector<int, min_allocator<int> >) == 12, "");
diff --git a/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp
new file mode 100644
index 000000000000..cf802e214d07
--- /dev/null
+++ b/libcxx/test/libcxx/containers/strings/basic.string/abi.compile.pass.cpp
@@ -0,0 +1,106 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+#include <string>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_macros.h"
+
+template <class T>
+class small_pointer {
+public:
+  using value_type        = T;
+  using difference_type   = std::int16_t;
+  using pointer           = small_pointer;
+  using reference         = T&;
+  using iterator_category = std::random_access_iterator_tag;
+
+private:
+  std::uint16_t offset;
+};
+
+template <class T>
+class small_iter_allocator {
+public:
+  using value_type      = T;
+  using pointer         = small_pointer<T>;
+  using size_type       = std::int16_t;
+  using difference_type = std::int16_t;
+
+  small_iter_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  small_iter_allocator(small_iter_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(small_iter_allocator, small_iter_allocator) { return true; }
+  friend bool operator!=(small_iter_allocator, small_iter_allocator) { return false; }
+};
+
+struct allocator_base {};
+
+// Make sure that types with a common base type don't get broken. See https://llvm.org/PR154146
+template <class T>
+struct common_base_allocator : allocator_base {
+  using value_type = T;
+
+  common_base_allocator() TEST_NOEXCEPT {}
+
+  template <class U>
+  common_base_allocator(common_base_allocator<U>) TEST_NOEXCEPT {}
+
+  T* allocate(std::size_t n);
+  void deallocate(T* p, std::size_t);
+
+  friend bool operator==(common_base_allocator, common_base_allocator) { return true; }
+  friend bool operator!=(common_base_allocator, common_base_allocator) { return false; }
+};
+
+template <class Alloc>
+using string_alloc = std::basic_string<char, std::char_traits<char>, Alloc>;
+
+struct user_struct {
+  string_alloc<common_base_allocator<char> > v;
+  [[no_unique_address]] common_base_allocator<int> a;
+};
+
+#if __SIZE_WIDTH__ == 64
+static_assert(sizeof(user_struct) == 32, "");
+static_assert(TEST_ALIGNOF(user_struct) == 8, "");
+
+static_assert(sizeof(string_alloc<std::allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<min_allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<test_allocator<char> >) == 32, "");
+static_assert(sizeof(string_alloc<small_iter_allocator<char> >) == 6, "");
+
+static_assert(TEST_ALIGNOF(string_alloc<std::allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<min_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<test_allocator<char> >) == 8, "");
+static_assert(TEST_ALIGNOF(string_alloc<small_iter_allocator<char> >) == 2, "");
+
+#elif __SIZE_WIDTH__ == 32
+static_assert(sizeof(user_struct) == 16, "");
+static_assert(TEST_ALIGNOF(user_struct) == 4, "");
+
+static_assert(sizeof(string_alloc<std::allocator<char> >) == 12, "");
+static_assert(sizeof(string_alloc<min_allocator<char> >) == 12, "");
+static_assert(sizeof(string_alloc<test_allocator<char> >) == 24, "");
+static_assert(sizeof(string_alloc<small_iter_allocator<char> >) == 6, "");
+
+static_assert(TEST_ALIGNOF(string_alloc<std::allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<min_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<test_allocator<char> >) == 4, "");
+static_assert(TEST_ALIGNOF(string_alloc<small_iter_allocator<char> >) == 2, "");
+
+#else
+#  error std::size_t has an unexpected size
+#endif
diff --git a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
index 3cf497da233f..3d97446ffe82 100644
--- a/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
+++ b/libcxx/test/libcxx/experimental/fexperimental-library.compile.pass.cpp
@@ -29,3 +29,7 @@
 #if !_LIBCPP_HAS_EXPERIMENTAL_SYNCSTREAM
 #  error "-fexperimental-library should enable the syncstream header"
 #endif
+
+#if !_LIBCPP_HAS_EXPERIMENTAL_HARDENING_OBSERVE_SEMANTIC
+#  error "-fexperimental-library should allow using the Hardening observe semantic"
+#endif
diff --git a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
index 419a603a037f..2bc4648878f8 100644
--- a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp
@@ -8,6 +8,8 @@
 // UNSUPPORTED: no-threads
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 // REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// Without the assertion, the test will most likely time out.
+// UNSUPPORTED: libcpp-assertion-semantic={{ignore|observe}}
 
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
diff --git a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
index e61679554a62..30d36b5f6d7b 100644
--- a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
+++ b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp
@@ -18,6 +18,8 @@
 
 // REQUIRES: has-unix-headers
 // REQUIRES: libcpp-hardening-mode={{extensive|debug}}
+// Without the assertion, the test will most likely time out.
+// UNSUPPORTED: libcpp-assertion-semantic={{ignore|observe}}
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
 
 #include <latch>
diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
index 91a7db1d9a7c..1a2d080d10c3 100644
--- a/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
+++ b/libcxx/test/libcxx/utilities/expected/expected.expected/value.observers.verify.cpp
@@ -124,8 +124,9 @@ void test() {
 #if _LIBCPP_HAS_EXCEPTIONS
   // expected-error-re@*:* {{call to deleted constructor of{{.*}}}}
   // expected-error-re@*:* {{call to deleted constructor of{{.*}}}}
-  // expected-error-re@*:* 1-2{{call to deleted constructor of{{.*}}}}
-  // expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
 #endif
+// These diagnostics can also additionally be produced by static_assert (see GH150601).
+// expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
+// expected-error-re@*:* 0-2{{call to deleted constructor of{{.*}}}}
 }
 // clang-format on
diff --git a/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
index c7ba76517896..62491e2bc443 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/count0.pass.cpp
@@ -33,6 +33,10 @@ int main(int, char**) {
     typedef std::map<int, double, transparent_less_not_referenceable> M;
     assert(M().count(C2Int{5}) == 0);
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    assert(M().count(C2Int{5}) == 0);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
index 75724bdb387c..57ce9339f632 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/equal_range0.pass.cpp
@@ -40,6 +40,13 @@ int main(int, char**) {
     P result = example.equal_range(C2Int{5});
     assert(result.first == result.second);
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    using P = std::pair<typename M::iterator, typename M::iterator>;
+    M example;
+    P result = example.equal_range(C2Int{5});
+    assert(result.first == result.second);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
index 9825d6c5879b..3f09d5608772 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/find0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.find(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.find(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
index fe7fe381a86e..308a2ed1e3af 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/lower_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.lower_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.lower_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp b/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
index 525aa673ea74..332b71a84e9f 100644
--- a/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/map/map.ops/upper_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.upper_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::map<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.upper_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
index 233d1a11e1d6..36f0ac2647ba 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/count0.pass.cpp
@@ -33,6 +33,10 @@ int main(int, char**) {
     typedef std::multimap<int, double, transparent_less_not_referenceable> M;
     assert(M().count(C2Int{5}) == 0);
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    assert(M().count(C2Int{5}) == 0);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
index 0bead6c7938d..a362c03e2638 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/equal_range0.pass.cpp
@@ -40,6 +40,13 @@ int main(int, char**) {
     P result = example.equal_range(C2Int{5});
     assert(result.first == result.second);
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    using P = std::pair<typename M::iterator, typename M::iterator>;
+    M example;
+    P result = example.equal_range(C2Int{5});
+    assert(result.first == result.second);
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
index 701d4e314e7a..ccb0900e7683 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/find0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.find(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.find(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
index 79f994847f13..4b4853062001 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/lower_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.lower_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.lower_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp b/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
index 62f52416b915..f2ae94577b6c 100644
--- a/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
+++ b/libcxx/test/std/containers/associative/multimap/multimap.ops/upper_bound0.pass.cpp
@@ -36,6 +36,11 @@ int main(int, char**) {
     M example;
     assert(example.upper_bound(C2Int{5}) == example.end());
   }
+  {
+    using M = std::multimap<int, double, transparent_less_nonempty>;
+    M example;
+    assert(example.upper_bound(C2Int{5}) == example.end());
+  }
 
   return 0;
 }
diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp
new file mode 100644
index 000000000000..27e06982d749
--- /dev/null
+++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/overflow.writefail.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: no-filesystem
+
+// setrlimit(RLIMIT_FSIZE) seems to only work as intended on Apple platforms
+// REQUIRES: target={{.+}}-apple-{{.+}}
+
+// <fstream>
+
+// Make sure that we properly handle the case where we try to write content to a file
+// but we fail to do so because std::fwrite fails.
+
+#include <cassert>
+#include <csignal>
+#include <cstddef>
+#include <fstream>
+#include <string>
+
+#include "platform_support.h"
+#include "test_macros.h"
+
+#if __has_include(<sys/resource.h>)
+#  include <sys/resource.h>
+void limit_file_size_to(std::size_t bytes) {
+  rlimit lim = {bytes, bytes};
+  assert(setrlimit(RLIMIT_FSIZE, &lim) == 0);
+
+  std::signal(SIGXFSZ, [](int) {}); // ignore SIGXFSZ to ensure std::fwrite fails
+}
+#else
+#  error No known way to limit the amount of filesystem space available
+#endif
+
+template <class CharT>
+void test() {
+  std::string temp = get_temp_file_name();
+  std::basic_filebuf<CharT> fbuf;
+  assert(fbuf.open(temp, std::ios::out | std::ios::trunc));
+
+  std::size_t const limit = 100000;
+  limit_file_size_to(limit);
+
+  std::basic_string<CharT> large_block(limit / 10, CharT(42));
+
+  std::streamsize ret;
+  std::size_t bytes_written = 0;
+  while ((ret = fbuf.sputn(large_block.data(), large_block.size())) != 0) {
+    bytes_written += ret;
+
+    // In theory, it's possible for an implementation to allow writing arbitrarily more bytes than
+    // set by setrlimit, but in practice if we bust 100x our limit, something else is wrong with the
+    // test and we'd end up looping forever.
+    assert(bytes_written < 100 * limit);
+  }
+
+  fbuf.close();
+}
+
+int main(int, char**) {
+  test<char>();
+#ifndef TEST_HAS_NO_WIDE_CHARACTERS
+  test<wchar_t>();
+#endif
+
+  return 0;
+}
diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/enum.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/enum.pass.cpp
index fd1fbe5f113b..05a19329f909 100644
--- a/libcxx/test/std/utilities/function.objects/unord.hash/enum.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/unord.hash/enum.pass.cpp
@@ -21,6 +21,10 @@
 
 #include "test_macros.h"
 
+#if TEST_STD_VER >= 11
+#  include "poisoned_hash_helper.h"
+#endif
+
 enum class Colors { red, orange, yellow, green, blue, indigo, violet };
 enum class Cardinals { zero, one, two, three, five=5 };
 enum class LongColors : short { red, orange, yellow, green, blue, indigo, violet };
@@ -33,6 +37,12 @@ template <class T>
 void
 test()
 {
+#if TEST_STD_VER >= 11
+    test_hash_disabled<const T>();
+    test_hash_disabled<volatile T>();
+    test_hash_disabled<const volatile T>();
+#endif
+
     typedef std::hash<T> H;
 #if TEST_STD_VER <= 17
     static_assert((std::is_same<typename H::argument_type, T>::value), "");
diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/floating.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/floating.pass.cpp
index b8f85e193dc8..f6c31d068b23 100644
--- a/libcxx/test/std/utilities/function.objects/unord.hash/floating.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/unord.hash/floating.pass.cpp
@@ -27,10 +27,20 @@
 
 #include "test_macros.h"
 
+#if TEST_STD_VER >= 11
+#  include "poisoned_hash_helper.h"
+#endif
+
 template <class T>
 void
 test()
 {
+#if TEST_STD_VER >= 11
+    test_hash_disabled<const T>();
+    test_hash_disabled<volatile T>();
+    test_hash_disabled<const volatile T>();
+#endif
+
     typedef std::hash<T> H;
 #if TEST_STD_VER <= 17
     static_assert((std::is_same<typename H::argument_type, T>::value), "");
diff --git a/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp b/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp
index 14af7093c1e3..c798b3aa77ea 100644
--- a/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp
+++ b/libcxx/test/std/utilities/function.objects/unord.hash/integral.pass.cpp
@@ -26,10 +26,20 @@
 
 #include "test_macros.h"
 
+#if TEST_STD_VER >= 11
+#  include "poisoned_hash_helper.h"
+#endif
+
 template <class T>
 void
 test()
 {
+#if TEST_STD_VER >= 11
+    test_hash_disabled<const T>();
+    test_hash_disabled<volatile T>();
+    test_hash_disabled<const volatile T>();
+#endif
+
     typedef std::hash<T> H;
 #if TEST_STD_VER <= 17
     static_assert((std::is_same<typename H::argument_type, T>::value), "");
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
index 09086a4c046d..8e57e8913dcb 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.comp/is_bounded_array.pass.cpp
@@ -7,9 +7,6 @@
 //===----------------------------------------------------------------------===//
 // UNSUPPORTED: c++03, c++11, c++14, c++17
 
-// The Clang version that Android currently uses in the CI is too old.
-// XFAIL: LIBCXX-ANDROID-FIXME
-
 // type_traits
 
 // is_bounded_array<T>
diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
index 9aac871f2633..bd7da40daf2b 100644
--- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
+++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/has_unique_object_representations.compile.pass.cpp
@@ -8,9 +8,6 @@
 
 // UNSUPPORTED: c++03, c++11, c++14
 
-// The Clang version that Android currently uses in the CI is too old.
-// XFAIL: LIBCXX-ANDROID-FIXME
-
 // type_traits
 
 // has_unique_object_representations
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
index 7f4c90922d6c..e2abfd17d27c 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/default.pass.cpp
@@ -82,6 +82,10 @@ TEST_CONSTEXPR_CXX23 bool test_basic() {
     p.get_deleter().set_state(5);
     assert(p.get_deleter().state() == 5);
   }
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     std::unique_ptr<ElemType, DefaultCtorDeleter<ElemType> > p;
     assert(p.get() == 0);
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
index 45017a03b95d..f5e0541684d1 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/nullptr.pass.cpp
@@ -47,6 +47,10 @@ TEST_CONSTEXPR_CXX23 void test_basic() {
     assert(p.get() == 0);
     assert(p.get_deleter().state() == 0);
   }
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     std::unique_ptr<VT, DefaultCtorDeleter<VT> > p(nullptr);
     assert(p.get() == 0);
diff --git a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
index cbce5c9c74c5..e9912d4574af 100644
--- a/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
+++ b/libcxx/test/std/utilities/smartptr/unique.ptr/unique.ptr.class/unique.ptr.ctor/pointer.pass.cpp
@@ -75,6 +75,10 @@ TEST_CONSTEXPR_CXX23 void test_pointer() {
   }
   if (!TEST_IS_CONSTANT_EVALUATED)
     assert(A::count == 0);
+// TODO: Remove this check once https://llvm.org/PR154567 is fixed
+#if TEST_STD_VER >= 23 && defined(TEST_COMPILER_CLANG)
+  if (!TEST_IS_CONSTANT_EVALUATED)
+#endif
   {
     A* p = newValue<ValueT>(expect_alive);
     if (!TEST_IS_CONSTANT_EVALUATED)
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
index 142da1d820d9..6111138726db 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
@@ -173,6 +173,11 @@ void test_vector_bool() {
   assert(std::get<0>(v) == true);
 }
 
+struct ConvertibleFromAny {
+  template <class V>
+  ConvertibleFromAny(V) {}
+};
+
 int main(int, char**) {
   test_T_ctor_basic();
   test_T_ctor_noexcept();
@@ -180,5 +185,16 @@ int main(int, char**) {
   test_no_narrowing_check_for_class_types();
   test_construction_with_repeated_types();
   test_vector_bool();
+
+  { // Check that the constraints are evaluated lazily (see https://github.com/llvm/llvm-project/issues/151328)
+    struct Matcher {
+      Matcher() {}
+      Matcher(std::variant<ConvertibleFromAny>) {}
+    };
+
+    Matcher vec;
+    [[maybe_unused]] Matcher m = std::move(vec);
+  }
+
   return 0;
 }
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index a279400d651b..8416de76cfd6 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -44,15 +44,32 @@ static constexpr const char* Marker = "###";
 using MatchResult = std::pair<bool, std::string>;
 using Matcher     = std::function<MatchResult(const std::string& /*text*/)>;
 
-MatchResult MatchAssertionMessage(const std::string& text, std::string_view expected_message) {
+// Using the marker makes matching more precise, but we cannot output the marker when the `observe` semantic is used
+// (because it doesn't allow customizing the logging function). If the marker is not available, fall back to using less
+// precise matching by just the error message.
+MatchResult MatchAssertionMessage(const std::string& text, std::string_view expected_message, bool use_marker) {
   // Extract information from the error message. This has to stay synchronized with how we format assertions in the
   // library.
-  std::regex assertion_format(".*###\\n(.*):(\\d+): assertion (.*) failed: (.*)\\n###");
+  std::string assertion_format_string = [&] {
+    if (use_marker)
+      return (".*###\\n(.*):(\\d+): libc\\+\\+ Hardening assertion (.*) failed: (.*)\\n###");
+    return ("(.*):(\\d+): libc\\+\\+ Hardening assertion (.*) failed: (.*)\\n");
+  }();
+  std::regex assertion_format(assertion_format_string);
 
   std::smatch match_result;
-  bool has_match = std::regex_match(text, match_result, assertion_format);
-  assert(has_match);
-  assert(match_result.size() == 5);
+  // If a non-terminating assertion semantic is used, more than one assertion might be triggered before the process
+  // dies, so we cannot expect the entire target string to match.
+  bool has_match = std::regex_search(text, match_result, assertion_format);
+  if (!has_match || match_result.size() != 5) {
+    std::stringstream matching_error;
+    matching_error                                                     //
+        << "Failed to parse the assertion message.\n"                  //
+        << "Using marker:        " << use_marker << "\n"               //
+        << "Expected message:   '" << expected_message.data() << "'\n" //
+        << "Stderr contents:    '" << text.c_str() << "'\n";
+    return MatchResult(/*success=*/false, matching_error.str());
+  }
 
   const std::string& file = match_result[1];
   int line                = std::stoi(match_result[2]);
@@ -72,9 +89,9 @@ MatchResult MatchAssertionMessage(const std::string& text, std::string_view expe
   return MatchResult(/*success=*/true, /*maybe_error=*/"");
 }
 
-Matcher MakeAssertionMessageMatcher(std::string_view assertion_message) {
+Matcher MakeAssertionMessageMatcher(std::string_view assertion_message, bool use_marker = true) {
   return [=](const std::string& text) { //
-    return MatchAssertionMessage(text, assertion_message);
+    return MatchAssertionMessage(text, assertion_message, use_marker);
   };
 }
 
@@ -85,13 +102,17 @@ Matcher MakeAnyMatcher() {
 }
 
 enum class DeathCause {
-  // Valid causes
+  // Valid causes.
   VerboseAbort = 1,
   StdAbort,
   StdTerminate,
   Trap,
-  // Invalid causes
+  // Causes that might be invalid or might stem from undefined behavior (relevant for non-terminating assertion
+  // semantics).
   DidNotDie,
+  Segfault,
+  ArithmeticError,
+  // Always invalid causes.
   SetupFailure,
   Unknown
 };
@@ -108,6 +129,16 @@ bool IsValidCause(DeathCause cause) {
   }
 }
 
+bool IsTestSetupErrorCause(DeathCause cause) {
+  switch (cause) {
+  case DeathCause::SetupFailure:
+  case DeathCause::Unknown:
+    return true;
+  default:
+    return false;
+  }
+}
+
 std::string ToString(DeathCause cause) {
   switch (cause) {
   case DeathCause::VerboseAbort:
@@ -120,10 +151,14 @@ std::string ToString(DeathCause cause) {
     return "trap";
   case DeathCause::DidNotDie:
     return "<invalid cause (child did not die)>";
+  case DeathCause::Segfault:
+    return "<invalid cause (segmentation fault)>";
+  case DeathCause::ArithmeticError:
+    return "<invalid cause (fatal arithmetic error)>";
   case DeathCause::SetupFailure:
-    return "<invalid cause (child failed to set up test environment)>";
+    return "<test setup error (child failed to set up test environment)>";
   case DeathCause::Unknown:
-    return "<invalid cause (cause unknown)>";
+    return "<test setup error (test doesn't know how to interpret the death cause)>";
   }
 
   assert(false && "Unreachable");
@@ -225,9 +260,38 @@ class DeathTest {
     return DeathTestResult(Outcome::Success, cause);
   }
 
-  void PrintFailureDetails(std::string_view failure_description, std::string_view stmt, DeathCause cause) const {
-    std::fprintf(
-        stderr, "Failure: EXPECT_DEATH( %s ) failed!\n(reason: %s)\n\n", stmt.data(), failure_description.data());
+  // When non-terminating assertion semantics are used, the program will invoke UB which might or might not crash the
+  // process; we make sure that the execution produces the expected error message but otherwise consider the test run
+  // successful whether the child process dies or not.
+  template <class Func>
+  DeathTestResult RunWithoutGuaranteedDeath(Func&& func, const Matcher& matcher) {
+    std::signal(SIGABRT, [](int) { StopChildProcess(DeathCause::StdAbort); });
+    std::set_terminate([] { StopChildProcess(DeathCause::StdTerminate); });
+
+    DeathCause cause = Run(func);
+
+    if (IsTestSetupErrorCause(cause)) {
+      return DeathTestResult(Outcome::InvalidCause, cause, ToString(cause));
+    }
+
+    MatchResult match_result = matcher(GetChildStdErr());
+    if (!match_result.first) {
+      auto failure_description = std::string("Child produced a different error message\n") + match_result.second;
+      return DeathTestResult(Outcome::UnexpectedErrorMessage, cause, failure_description);
+    }
+
+    return DeathTestResult(Outcome::Success, cause);
+  }
+
+  void PrintFailureDetails(std::string_view invocation,
+                           std::string_view failure_description,
+                           std::string_view stmt,
+                           DeathCause cause) const {
+    std::fprintf(stderr,
+                 "Failure: %s( %s ) failed!\n(reason: %s)\n\n",
+                 invocation.data(),
+                 stmt.data(),
+                 failure_description.data());
 
     if (cause != DeathCause::Unknown) {
       std::fprintf(stderr, "child exit code: %d\n", GetChildExitCode());
@@ -311,10 +375,16 @@ class DeathTest {
 
     if (WIFSIGNALED(status_value)) {
       exit_code_ = WTERMSIG(status_value);
-      // `__builtin_trap` generqtes `SIGILL` on x86 and `SIGTRAP` on ARM.
+      // `__builtin_trap` generates `SIGILL` on x86 and `SIGTRAP` on ARM.
       if (exit_code_ == SIGILL || exit_code_ == SIGTRAP) {
         return DeathCause::Trap;
       }
+      if (exit_code_ == SIGSEGV) {
+        return DeathCause::Segfault;
+      }
+      if (exit_code_ == SIGFPE) {
+        return DeathCause::ArithmeticError;
+      }
     }
 
     return DeathCause::Unknown;
@@ -357,7 +427,7 @@ bool ExpectDeath(
   DeathTest test_case;
   DeathTestResult test_result = test_case.Run(expected_causes, func, matcher);
   if (!test_result.success()) {
-    test_case.PrintFailureDetails(test_result.failure_description(), stmt, test_result.cause());
+    test_case.PrintFailureDetails("EXPECT_DEATH", test_result.failure_description(), stmt, test_result.cause());
   }
 
   return test_result.success();
@@ -378,6 +448,22 @@ bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) {
   return ExpectDeath(std::array<DeathCause, 1>{expected_cause}, stmt, func, MakeAnyMatcher());
 }
 
+template <class Func>
+bool ExpectLog(const char* stmt, Func&& func, const Matcher& matcher) {
+  DeathTest test_case;
+  DeathTestResult test_result = test_case.RunWithoutGuaranteedDeath(func, matcher);
+  if (!test_result.success()) {
+    test_case.PrintFailureDetails("EXPECT_LOG", test_result.failure_description(), stmt, test_result.cause());
+  }
+
+  return test_result.success();
+}
+
+template <class Func>
+bool ExpectLog(const char* stmt, Func&& func) {
+  return ExpectLog(stmt, func, MakeAnyMatcher());
+}
+
 // clang-format off
 
 /// Assert that the specified expression aborts with the expected cause and, optionally, error message.
@@ -392,13 +478,28 @@ bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) {
 #define EXPECT_STD_TERMINATE(...)                 \
     assert(  ExpectDeath(DeathCause::StdTerminate, #__VA_ARGS__, __VA_ARGS__)  )
 
-#if defined(_LIBCPP_HARDENING_MODE) && _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#if defined(_LIBCPP_ASSERTION_SEMANTIC)
+
+#if _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
 #define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
     assert(( ExpectDeath(DeathCause::VerboseAbort, #expr, [&]() { (void)(expr); }, MakeAssertionMessageMatcher(message)) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectDeath(DeathCause::Trap,         #expr, [&]() { (void)(expr); }) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectLog(#expr, [&]() { (void)(expr); }, MakeAssertionMessageMatcher(message, /*use_marker=*/false)) ))
+#elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
+#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
+    assert(( ExpectLog(#expr, [&]() { (void)(expr); }) ))
+#else
+#error "Unknown value for _LIBCPP_ASSERTION_SEMANTIC"
+#endif // _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+
 #else
 #define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \
     assert(( ExpectDeath(DeathCause::Trap,         #expr, [&]() { (void)(expr); }) ))
-#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#endif // defined(_LIBCPP_ASSERTION_SEMANTIC)
 
 // clang-format on
 
diff --git a/libcxx/test/support/is_transparent.h b/libcxx/test/support/is_transparent.h
index 700c894a8b60..4b2a458f574a 100644
--- a/libcxx/test/support/is_transparent.h
+++ b/libcxx/test/support/is_transparent.h
@@ -36,6 +36,17 @@ struct transparent_less_not_referenceable
     using is_transparent = void () const &;  // it's a type; a weird one, but a type
 };
 
+// Prevent regression when empty base class optimization is not suitable.
+// See https://github.com/llvm/llvm-project/issues/152543.
+struct transparent_less_nonempty {
+  template <class T, class U>
+  constexpr bool operator()(T&& t, U&& u) const {
+    return std::forward<T>(t) < std::forward<U>(u);
+  }
+  struct is_transparent {
+  } pad_; // making this comparator non-empty
+};
+
 struct transparent_less_no_type
 {
     template <class T, class U>
diff --git a/libcxx/test/support/test.support/test_check_assertion.pass.cpp b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
index 4dfc5319aaf9..78e47b32cdd2 100644
--- a/libcxx/test/support/test.support/test_check_assertion.pass.cpp
+++ b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
@@ -53,7 +53,7 @@ bool TestDeathTest(
   }
 
   if (!maybe_failure_description.empty()) {
-    test_case.PrintFailureDetails(maybe_failure_description, stmt, test_result.cause());
+    test_case.PrintFailureDetails("EXPECT_DEATH", maybe_failure_description, stmt, test_result.cause());
     return false;
   }
 
@@ -76,9 +76,9 @@ DeathCause assertion_death_cause = DeathCause::Trap;
 #endif
 
 int main(int, char**) {
-  auto fail_assert     = [] { _LIBCPP_ASSERT(false, "Some message"); };
-  Matcher good_matcher = MakeAssertionMessageMatcher("Some message");
-  Matcher bad_matcher  = MakeAssertionMessageMatcher("Bad expected message");
+  [[maybe_unused]] auto fail_assert = [] { _LIBCPP_ASSERT(false, "Some message"); };
+  Matcher good_matcher              = MakeAssertionMessageMatcher("Some message");
+  Matcher bad_matcher               = MakeAssertionMessageMatcher("Bad expected message");
 
   // Test the implementation of death tests. We're bypassing the assertions added by the actual `EXPECT_DEATH` macros
   // which allows us to test failure cases (where the assertion would fail) as well.
@@ -89,16 +89,22 @@ int main(int, char**) {
     // Success -- trapping.
     TEST_DEATH_TEST(Outcome::Success, DeathCause::Trap, __builtin_trap());
 
+    // `_LIBCPP_ASSERT` does not terminate the program if the `observe` semantic is used, so these tests would fail with
+    // `DidNotDie` cause.
+#if _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+
     // Success -- assertion failure with any matcher.
     TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, MakeAnyMatcher(), fail_assert());
 
     // Success -- assertion failure with a specific matcher.
     TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, good_matcher, fail_assert());
 
-#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
     // Failure -- error message doesn't match.
     TEST_DEATH_TEST_MATCHES(Outcome::UnexpectedErrorMessage, assertion_death_cause, bad_matcher, fail_assert());
-#endif
+#  endif
+
+#endif // _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
 
     // Invalid cause -- child did not die.
     TEST_DEATH_TEST(Outcome::InvalidCause, DeathCause::DidNotDie, ((void)0));
@@ -125,7 +131,9 @@ int main(int, char**) {
     EXPECT_DEATH_MATCHES(simple_matcher, invoke_verbose_abort());
     EXPECT_STD_ABORT(invoke_abort());
     EXPECT_STD_TERMINATE([] { std::terminate(); });
+#if _LIBCPP_ASSERTION_SEMANTIC != _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
     TEST_LIBCPP_ASSERT_FAILURE(fail_assert(), "Some message");
+#endif
   }
 
   return 0;
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index d8b23be9a032..57ecf1e49dbf 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -442,6 +442,12 @@ generic-hardening-mode-extensive)
     check-runtimes
     check-abi-list
 ;;
+generic-hardening-mode-extensive-observe-semantic)
+    clean
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardening-mode-extensive-observe-semantic.cmake"
+    check-runtimes
+    check-abi-list
+;;
 generic-hardening-mode-debug)
     clean
     generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Generic-hardening-mode-debug.cmake"
diff --git a/libcxx/utils/libcxx/sym_check/util.py b/libcxx/utils/libcxx/sym_check/util.py
index fc7ba4244ab5..dbc886f29dde 100644
--- a/libcxx/utils/libcxx/sym_check/util.py
+++ b/libcxx/utils/libcxx/sym_check/util.py
@@ -95,7 +95,7 @@ def is_xcoff_or_big_ar(filename):
     with open(filename, "rb") as f:
         magic_bytes = f.read(7)
     return (
-        magic_bytes[:4] in [b"\x01DF", b"\x01F7"]  # XCOFF32  # XCOFF64
+        magic_bytes[:2] in [b"\x01\xDF", b"\x01\xF7"]  # XCOFF32  # XCOFF64
         or magic_bytes == b"<bigaf>"
     )
 
diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index adfb2a9f6950..81c613421a46 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -361,6 +361,7 @@ def getSuitableClangTidy(cfg):
             AddFeature("libcpp-has-no-incomplete-pstl"),
             AddFeature("libcpp-has-no-experimental-tzdb"),
             AddFeature("libcpp-has-no-experimental-syncstream"),
+            AddFeature("libcpp-has-no-experimental-hardening-observe-semantic"),
         ],
     ),
     # TODO: This can be improved once we use a version of GoogleBenchmark that supports the dry-run mode.
@@ -454,5 +455,24 @@ def getSuitableClangTidy(cfg):
         help="Whether to test the main or C++03-specific headers. Only changes behaviour when std=c++03.",
         actions=lambda enabled: [] if not enabled else [AddFlag("-D_LIBCPP_USE_FROZEN_CXX03_HEADERS"), AddFeature("FROZEN-CXX03-HEADERS-FIXME")],
     ),
+    Parameter(
+        name='assertion_semantic',
+        choices=["ignore", "observe", "quick_enforce", "enforce", "undefined"],
+        type=str,
+        default="undefined",
+        help="Whether to override the assertion semantic used by hardening. This is only meaningful when running the "
+        "tests against libc++ with hardening enabled. By default, no assertion semantic is specified explicitly, so "
+        "the default one will be used (depending on the hardening mode).",
+        actions=lambda assertion_semantic: filter(
+            None,
+            [
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_IGNORE")        if assertion_semantic == "ignore" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_OBSERVE")       if assertion_semantic == "observe" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE") if assertion_semantic == "quick_enforce" else None,
+                AddCompileFlag("-D_LIBCPP_ASSERTION_SEMANTIC=_LIBCPP_ASSERTION_SEMANTIC_ENFORCE")       if assertion_semantic == "enforce" else None,
+                AddFeature("libcpp-assertion-semantic={}".format(assertion_semantic))                   if assertion_semantic != "undefined" else None,
+            ],
+        ),
+    ),
 ]
 # fmt: on
diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in
index f115658f9f3c..d352405e905b 100644
--- a/libcxx/vendor/llvm/default_assertion_handler.in
+++ b/libcxx/vendor/llvm/default_assertion_handler.in
@@ -16,6 +16,7 @@
 #  include <__cxx03/__verbose_trap>
 #else
 #  include <__config>
+#  include <__log_hardening_failure>
 #  include <__verbose_abort>
 #  include <__verbose_trap>
 #endif
@@ -24,14 +25,40 @@
 #  pragma GCC system_header
 #endif
 
-#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
-#  define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+// Keep the old implementation that doesn't support assertion semantics for backward compatibility with the frozen C++03
+// mode.
+#  if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+#  else
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+#  endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
 
 #else
 
-#  define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+#  if _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
+#    define _LIBCPP_ASSERTION_HANDLER(message) ((void)0)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_OBSERVE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_LOG_HARDENING_FAILURE(message)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_TRAP(message)
+
+#  elif _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+#    define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message)
+
+#  else
+
+#    error _LIBCPP_ASSERTION_SEMANTIC must be set to one of the following values: \
+_LIBCPP_ASSERTION_SEMANTIC_IGNORE, \
+_LIBCPP_ASSERTION_SEMANTIC_OBSERVE, \
+_LIBCPP_ASSERTION_SEMANTIC_QUICK_ENFORCE, \
+_LIBCPP_ASSERTION_SEMANTIC_ENFORCE
+
+#  endif // _LIBCPP_ASSERTION_SEMANTIC == _LIBCPP_ASSERTION_SEMANTIC_IGNORE
 
-#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
+#endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
 
 #endif // _LIBCPP___ASSERTION_HANDLER
diff --git a/libcxxabi/src/demangle/DemangleConfig.h b/libcxxabi/src/demangle/DemangleConfig.h
index 06fd223f5553..7904e9d1eb13 100644
--- a/libcxxabi/src/demangle/DemangleConfig.h
+++ b/libcxxabi/src/demangle/DemangleConfig.h
@@ -19,6 +19,14 @@
 #include "../abort_message.h"
 #endif
 
+#ifndef _LIBCPP_LOG_HARDENING_FAILURE
+// Libc++abi does not have any functionality to log and continue, so we drop
+// error messages when we build the demangler with `observe` assertion semantic.
+// Once the layering with libc++ is improved, this could use the libc++
+// functionality to log hardening failures.
+#define _LIBCPP_LOG_HARDENING_FAILURE(message) ((void)0)
+#endif
+
 #include <version>
 
 #ifdef _MSC_VER
diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp
index 55db035e6204..9a1afd3721f5 100644
--- a/libunwind/src/UnwindCursor.hpp
+++ b/libunwind/src/UnwindCursor.hpp
@@ -173,7 +173,8 @@ bool DwarfFDECache<A>::_registeredForDyldUnloads = false;
 #endif
 
 template <typename A>
-typename A::pint_t DwarfFDECache<A>::findFDE(pint_t mh, pint_t pc) {
+typename DwarfFDECache<A>::pint_t DwarfFDECache<A>::findFDE(pint_t mh,
+                                                            pint_t pc) {
   pint_t result = 0;
   _LIBUNWIND_LOG_IF_FALSE(_lock.lock_shared());
   for (entry *p = _buffer; p < _bufferUsed; ++p) {
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
index 79b63e5b7236..91b6e632fa7e 100644
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -192,6 +192,18 @@ struct Configuration {
   // Used for /lldltocachepolicy=policy
   llvm::CachePruningPolicy ltoCachePolicy;
 
+  // Used for /thinlto-distributor:<path>
+  StringRef dtltoDistributor;
+
+  // Used for /thinlto-distributor-arg:<arg>
+  llvm::SmallVector<llvm::StringRef, 0> dtltoDistributorArgs;
+
+  // Used for /thinlto-remote-compiler:<path>
+  StringRef dtltoCompiler;
+
+  // Used for /thinlto-remote-compiler-arg:<arg>
+  llvm::SmallVector<llvm::StringRef, 0> dtltoCompilerArgs;
+
   // Used for /opt:[no]ltodebugpassmanager
   bool ltoDebugPassManager = false;
 
@@ -307,7 +319,7 @@ struct Configuration {
   bool warnDebugInfoUnusable = true;
   bool warnLongSectionNames = true;
   bool warnStdcallFixup = true;
-  bool warnExportedDllMain = true;
+  bool warnImportedDllMain = true;
   bool incremental = true;
   bool integrityCheck = false;
   bool killAt = false;
diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index c327da28ce13..3ce8853adb2a 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -244,40 +244,36 @@ static const uint8_t thunkX64[] = {
 };
 
 static const uint8_t tailMergeX64[] = {
-    0x51,                               // push    rcx
-    0x52,                               // push    rdx
-    0x41, 0x50,                         // push    r8
-    0x41, 0x51,                         // push    r9
-    0x48, 0x83, 0xEC, 0x48,             // sub     rsp, 48h
-    0x66, 0x0F, 0x7F, 0x04, 0x24,       // movdqa  xmmword ptr [rsp], xmm0
-    0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x10, // movdqa  xmmword ptr [rsp+10h], xmm1
-    0x66, 0x0F, 0x7F, 0x54, 0x24, 0x20, // movdqa  xmmword ptr [rsp+20h], xmm2
-    0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x30, // movdqa  xmmword ptr [rsp+30h], xmm3
-    0x48, 0x8B, 0xD0,                   // mov     rdx, rax
-    0x48, 0x8D, 0x0D, 0, 0, 0, 0,       // lea     rcx, [___DELAY_IMPORT_...]
-    0xE8, 0, 0, 0, 0,                   // call    __delayLoadHelper2
-    0x66, 0x0F, 0x6F, 0x04, 0x24,       // movdqa  xmm0, xmmword ptr [rsp]
-    0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x10, // movdqa  xmm1, xmmword ptr [rsp+10h]
-    0x66, 0x0F, 0x6F, 0x54, 0x24, 0x20, // movdqa  xmm2, xmmword ptr [rsp+20h]
-    0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x30, // movdqa  xmm3, xmmword ptr [rsp+30h]
-    0x48, 0x83, 0xC4, 0x48,             // add     rsp, 48h
-    0x41, 0x59,                         // pop     r9
-    0x41, 0x58,                         // pop     r8
-    0x5A,                               // pop     rdx
-    0x59,                               // pop     rcx
-    0xFF, 0xE0,                         // jmp     rax
+    0x48, 0x89, 0x4C, 0x24, 0x08,          // mov    qword ptr [rsp+8], rcx
+    0x48, 0x89, 0x54, 0x24, 0x10,          // mov    qword ptr [rsp+10h], rdx
+    0x4C, 0x89, 0x44, 0x24, 0x18,          // mov    qword ptr [rsp+18h], r8
+    0x4C, 0x89, 0x4C, 0x24, 0x20,          // mov    qword ptr [rsp+20h], r9
+    0x48, 0x83, 0xEC, 0x68,                // sub    rsp, 68h
+    0x66, 0x0F, 0x7F, 0x44, 0x24, 0x20,    // movdqa xmmword ptr [rsp+20h], xmm0
+    0x66, 0x0F, 0x7F, 0x4C, 0x24, 0x30,    // movdqa xmmword ptr [rsp+30h], xmm1
+    0x66, 0x0F, 0x7F, 0x54, 0x24, 0x40,    // movdqa xmmword ptr [rsp+40h], xmm2
+    0x66, 0x0F, 0x7F, 0x5C, 0x24, 0x50,    // movdqa xmmword ptr [rsp+50h], xmm3
+    0x48, 0x8B, 0xD0,                      // mov    rdx, rax
+    0x48, 0x8D, 0x0D, 0, 0, 0, 0,          // lea    rcx, [___DELAY_IMPORT_...]
+    0xE8, 0, 0, 0, 0,                      // call   __delayLoadHelper2
+    0x66, 0x0F, 0x6F, 0x44, 0x24, 0x20,    // movdqa xmm0, xmmword ptr [rsp+20h]
+    0x66, 0x0F, 0x6F, 0x4C, 0x24, 0x30,    // movdqa xmm1, xmmword ptr [rsp+30h]
+    0x66, 0x0F, 0x6F, 0x54, 0x24, 0x40,    // movdqa xmm2, xmmword ptr [rsp+40h]
+    0x66, 0x0F, 0x6F, 0x5C, 0x24, 0x50,    // movdqa xmm3, xmmword ptr [rsp+50h]
+    0x48, 0x8B, 0x4C, 0x24, 0x70,          // mov    rcx, qword ptr [rsp+70h]
+    0x48, 0x8B, 0x54, 0x24, 0x78,          // mov    rdx, qword ptr [rsp+78h]
+    0x4C, 0x8B, 0x84, 0x24, 0x80, 0, 0, 0, // mov    r8, qword ptr [rsp+80h]
+    0x4C, 0x8B, 0x8C, 0x24, 0x88, 0, 0, 0, // mov    r9, qword ptr [rsp+88h]
+    0x48, 0x83, 0xC4, 0x68,                // add    rsp, 68h
+    0xFF, 0xE0,                            // jmp    rax
 };
 
 static const uint8_t tailMergeUnwindInfoX64[] = {
     0x01,       // Version=1, Flags=UNW_FLAG_NHANDLER
-    0x0a,       // Size of prolog
-    0x05,       // Count of unwind codes
+    0x18,       // Size of prolog
+    0x01,       // Count of unwind codes
     0x00,       // No frame register
-    0x0a, 0x82, // Offset 0xa: UWOP_ALLOC_SMALL(0x48)
-    0x06, 0x02, // Offset 6: UWOP_ALLOC_SMALL(8)
-    0x04, 0x02, // Offset 4: UWOP_ALLOC_SMALL(8)
-    0x02, 0x02, // Offset 2: UWOP_ALLOC_SMALL(8)
-    0x01, 0x02, // Offset 1: UWOP_ALLOC_SMALL(8)
+    0x18, 0xC2, // Offset 0x18: UWOP_ALLOC_SMALL(0x68)
     0x00, 0x00  // Padding to align on 32-bits
 };
 
@@ -378,8 +374,8 @@ class TailMergeChunkX64 : public NonSectionCodeChunk {
 
   void writeTo(uint8_t *buf) const override {
     memcpy(buf, tailMergeX64, sizeof(tailMergeX64));
-    write32le(buf + 39, desc->getRVA() - rva - 43);
-    write32le(buf + 44, helper->getRVA() - rva - 48);
+    write32le(buf + 54, desc->getRVA() - rva - 58);
+    write32le(buf + 59, helper->getRVA() - rva - 63);
   }
 
   Chunk *desc = nullptr;
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 283aeed1a19c..570b8f9d0590 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -274,8 +274,13 @@ void LinkerDriver::addBuffer(std::unique_ptr<MemoryBuffer> mb,
       make<std::unique_ptr<Archive>>(std::move(file)); // take ownership
 
       int memberIndex = 0;
-      for (MemoryBufferRef m : getArchiveMembers(ctx, archive))
-        addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
+      for (MemoryBufferRef m : getArchiveMembers(ctx, archive)) {
+        if (!archive->isThin())
+          addArchiveBuffer(m, "<whole-archive>", filename, memberIndex++);
+        else
+          addThinArchiveBuffer(m, "<whole-archive>");
+      }
+
       return;
     }
     addFile(make<ArchiveFile>(ctx, mbref));
@@ -386,6 +391,14 @@ void LinkerDriver::addArchiveBuffer(MemoryBufferRef mb, StringRef symName,
   Log(ctx) << "Loaded " << obj << " for " << symName;
 }
 
+void LinkerDriver::addThinArchiveBuffer(MemoryBufferRef mb, StringRef symName) {
+  // Pass an empty string as the archive name and an offset of 0 so that
+  // the original filename is used as the buffer identifier. This is
+  // useful for DTLTO, where having the member identifier be the actual
+  // path on disk enables distribution of bitcode files during ThinLTO.
+  addArchiveBuffer(mb, symName, /*parentName=*/"", /*OffsetInArchive=*/0);
+}
+
 void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
                                         const Archive::Symbol &sym,
                                         StringRef parentName) {
@@ -422,11 +435,8 @@ void LinkerDriver::enqueueArchiveMember(const Archive::Child &c,
       reportBufferError(errorCodeToError(mbOrErr.second), childName);
     llvm::TimeTraceScope timeScope("Archive: ",
                                    mbOrErr.first->getBufferIdentifier());
-    // Pass empty string as archive name so that the original filename is
-    // used as the buffer identifier.
-    ctx.driver.addArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
-                                toCOFFString(ctx, sym), "",
-                                /*OffsetInArchive=*/0);
+    ctx.driver.addThinArchiveBuffer(takeBuffer(std::move(mbOrErr.first)),
+                                    toCOFFString(ctx, sym));
   });
 }
 
@@ -1643,8 +1653,8 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
         config->warnLocallyDefinedImported = false;
       else if (s == "longsections")
         config->warnLongSectionNames = false;
-      else if (s == "exporteddllmain")
-        config->warnExportedDllMain = false;
+      else if (s == "importeddllmain")
+        config->warnImportedDllMain = false;
       // Other warning numbers are ignored.
     }
   }
@@ -2088,6 +2098,23 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
     Fatal(ctx) << "/manifestinput: requires /manifest:embed";
   }
 
+  // Handle /thinlto-distributor:<path>
+  config->dtltoDistributor = args.getLastArgValue(OPT_thinlto_distributor);
+
+  // Handle /thinlto-distributor-arg:<arg>
+  for (auto *arg : args.filtered(OPT_thinlto_distributor_arg))
+    config->dtltoDistributorArgs.push_back(arg->getValue());
+
+  // Handle /thinlto-remote-compiler:<path>
+  config->dtltoCompiler = args.getLastArgValue(OPT_thinlto_compiler);
+  if (!config->dtltoDistributor.empty() && config->dtltoCompiler.empty())
+    Err(ctx) << "A value must be specified for /thinlto-remote-compiler if "
+                "/thinlto-distributor is specified.";
+
+  // Handle /thinlto-remote-compiler-arg:<arg>
+  for (auto *arg : args.filtered(OPT_thinlto_compiler_arg))
+    config->dtltoCompilerArgs.push_back(arg->getValue());
+
   // Handle /dwodir
   config->dwoDir = args.getLastArgValue(OPT_dwodir);
 
@@ -2527,28 +2554,7 @@ void LinkerDriver::linkerMain(ArrayRef<const char *> argsArr) {
             e.symbolName = symtab.mangleMaybe(e.sym);
         }
 
-        // Add weak aliases. Weak aliases is a mechanism to give remaining
-        // undefined symbols final chance to be resolved successfully.
-        for (auto pair : symtab.alternateNames) {
-          StringRef from = pair.first;
-          StringRef to = pair.second;
-          Symbol *sym = symtab.find(from);
-          if (!sym)
-            continue;
-          if (auto *u = dyn_cast<Undefined>(sym)) {
-            if (u->weakAlias) {
-              // On ARM64EC, anti-dependency aliases are treated as undefined
-              // symbols unless a demangled symbol aliases a defined one, which
-              // is part of the implementation.
-              if (!symtab.isEC() || !u->isAntiDep)
-                continue;
-              if (!isa<Undefined>(u->weakAlias) &&
-                  !isArm64ECMangledFunctionName(u->getName()))
-                continue;
-            }
-            u->setWeakAlias(symtab.addUndefined(to));
-          }
-        }
+        symtab.resolveAlternateNames();
       });
 
       ctx.forEachActiveSymtab([&](SymbolTable &symtab) {
diff --git a/lld/COFF/Driver.h b/lld/COFF/Driver.h
index 14c97a98875b..5a9bd5c6d968 100644
--- a/lld/COFF/Driver.h
+++ b/lld/COFF/Driver.h
@@ -173,6 +173,7 @@ class LinkerDriver {
                  bool lazy);
   void addArchiveBuffer(MemoryBufferRef mbref, StringRef symName,
                         StringRef parentName, uint64_t offsetInArchive);
+  void addThinArchiveBuffer(MemoryBufferRef mbref, StringRef symName);
 
   void enqueueTask(std::function<void()> task);
   bool run();
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp
index 0b7dbea8cdd9..2a6b63cbacca 100644
--- a/lld/COFF/InputFiles.cpp
+++ b/lld/COFF/InputFiles.cpp
@@ -117,8 +117,6 @@ static coff_symbol_generic *cloneSymbol(COFFSymbolRef sym) {
 // Skip importing DllMain thunks from import libraries.
 static bool fixupDllMain(COFFLinkerContext &ctx, llvm::object::Archive *file,
                          const Archive::Symbol &sym, bool &skipDllMain) {
-  if (skipDllMain)
-    return true;
   const Archive::Child &c =
       CHECK(sym.getMember(), file->getFileName() +
                                  ": could not get the member for symbol " +
@@ -128,13 +126,13 @@ static bool fixupDllMain(COFFLinkerContext &ctx, llvm::object::Archive *file,
             file->getFileName() +
                 ": could not get the buffer for a child buffer of the archive");
   if (identify_magic(mb.getBuffer()) == file_magic::coff_import_library) {
-    if (ctx.config.warnExportedDllMain) {
+    if (ctx.config.warnImportedDllMain) {
       // We won't place DllMain symbols in the symbol table if they are
       // coming from a import library. This message can be ignored with the flag
-      // '/ignore:exporteddllmain'
+      // '/ignore:importeddllmain'
       Warn(ctx)
           << file->getFileName()
-          << ": skipping exported DllMain symbol [exporteddllmain]\nNOTE: this "
+          << ": skipping imported DllMain symbol [importeddllmain]\nNOTE: this "
              "might be a mistake when the DLL/library was produced.";
     }
     skipDllMain = true;
@@ -204,14 +202,24 @@ void ArchiveFile::parse() {
     }
   }
 
-  // Read the symbol table to construct Lazy objects.
   bool skipDllMain = false;
+  StringRef mangledDllMain, impMangledDllMain;
+
+  // The calls below will fail if we haven't set the machine type yet. Instead
+  // of failing, it is preferable to skip this "imported DllMain" check if we
+  // don't know the machine type at this point.
+  if (!file->isEmpty() && ctx.config.machine != IMAGE_FILE_MACHINE_UNKNOWN) {
+    mangledDllMain = archiveSymtab->mangle("DllMain");
+    impMangledDllMain = uniqueSaver().save("__imp_" + mangledDllMain);
+  }
+
+  // Read the symbol table to construct Lazy objects.
   for (const Archive::Symbol &sym : file->symbols()) {
-    // If the DllMain symbol was exported by mistake, skip importing it
-    // otherwise we might end up with a import thunk in the final binary which
-    // is wrong.
-    if (sym.getName() == "__imp_DllMain" || sym.getName() == "DllMain") {
-      if (fixupDllMain(ctx, file.get(), sym, skipDllMain))
+    // If an import library provides the DllMain symbol, skip importing it, as
+    // we should be using our own DllMain, not another DLL's DllMain.
+    if (!mangledDllMain.empty() && (sym.getName() == mangledDllMain ||
+                                    sym.getName() == impMangledDllMain)) {
+      if (skipDllMain || fixupDllMain(ctx, file.get(), sym, skipDllMain))
         continue;
     }
     archiveSymtab->addLazyArchive(this, sym);
diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp
index 2a4d07cc2d01..1050874a1b10 100644
--- a/lld/COFF/LTO.cpp
+++ b/lld/COFF/LTO.cpp
@@ -110,7 +110,16 @@ BitcodeCompiler::BitcodeCompiler(COFFLinkerContext &c) : ctx(c) {
 
   // Initialize ltoObj.
   lto::ThinBackend backend;
-  if (ctx.config.thinLTOIndexOnly) {
+  if (!ctx.config.dtltoDistributor.empty()) {
+    backend = lto::createOutOfProcessThinBackend(
+        llvm::hardware_concurrency(ctx.config.thinLTOJobs),
+        /*OnWrite=*/nullptr,
+        /*ShouldEmitIndexFiles=*/false,
+        /*ShouldEmitImportFiles=*/false, ctx.config.outputFile,
+        ctx.config.dtltoDistributor, ctx.config.dtltoDistributorArgs,
+        ctx.config.dtltoCompiler, ctx.config.dtltoCompilerArgs,
+        !ctx.config.saveTempsArgs.empty());
+  } else if (ctx.config.thinLTOIndexOnly) {
     auto OnIndexWrite = [&](StringRef S) { thinIndices.erase(S); };
     backend = lto::createWriteIndexesThinBackend(
         llvm::hardware_concurrency(ctx.config.thinLTOJobs),
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
index a887d7d351e1..2a82fb5cd884 100644
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -270,6 +270,17 @@ def thinlto_object_suffix_replace : P<
 def thinlto_prefix_replace: P<
     "thinlto-prefix-replace",
     "'old;new' replace old prefix with new prefix in ThinLTO outputs">;
+def thinlto_distributor : P<"thinlto-distributor",
+  "Distributor to use for ThinLTO backend compilations. If specified, ThinLTO "
+  "backend compilations will be distributed">;
+def thinlto_distributor_arg : P<"thinlto-distributor-arg",
+  "Arguments to pass to the ThinLTO distributor">;
+def thinlto_compiler : P<"thinlto-remote-compiler",
+  "Compiler for the ThinLTO distributor to invoke for ThinLTO backend "
+  "compilations">;
+def thinlto_compiler_arg : P<"thinlto-remote-compiler-arg",
+  "Compiler arguments for the ThinLTO distributor to pass for ThinLTO backend "
+  "compilations">;
 def lto_obj_path : P<
     "lto-obj-path",
     "output native object for merged LTO unit to this path">;
diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index a54ea403ba2e..94eeae279797 100644
--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -1135,9 +1135,12 @@ static pdb::BulkPublic createPublic(COFFLinkerContext &ctx, Defined *def) {
   pub.setFlags(flags);
 
   OutputSection *os = ctx.getOutputSection(def->getChunk());
-  assert(os && "all publics should be in final image");
-  pub.Offset = def->getRVA() - os->getRVA();
-  pub.Segment = os->sectionIndex;
+  assert((os || !def->getChunk()->getSize()) &&
+         "all publics should be in final image");
+  if (os) {
+    pub.Offset = def->getRVA() - os->getRVA();
+    pub.Segment = os->sectionIndex;
+  }
   return pub;
 }
 
diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp
index 0062df5820e6..d15e0c24410c 100644
--- a/lld/COFF/SymbolTable.cpp
+++ b/lld/COFF/SymbolTable.cpp
@@ -1344,6 +1344,44 @@ void SymbolTable::parseAlternateName(StringRef s) {
   alternateNames.insert(it, std::make_pair(from, to));
 }
 
+void SymbolTable::resolveAlternateNames() {
+  // Add weak aliases. Weak aliases is a mechanism to give remaining
+  // undefined symbols final chance to be resolved successfully.
+  for (auto pair : alternateNames) {
+    StringRef from = pair.first;
+    StringRef to = pair.second;
+    Symbol *sym = find(from);
+    if (!sym)
+      continue;
+    if (auto *u = dyn_cast<Undefined>(sym)) {
+      if (u->weakAlias) {
+        // On ARM64EC, anti-dependency aliases are treated as undefined
+        // symbols unless a demangled symbol aliases a defined one, which
+        // is part of the implementation.
+        if (!isEC() || !u->isAntiDep)
+          continue;
+        if (!isa<Undefined>(u->weakAlias) &&
+            !isArm64ECMangledFunctionName(u->getName()))
+          continue;
+      }
+
+      // Check if the destination symbol is defined. If not, skip it.
+      // It may still be resolved later if more input files are added.
+      // Also skip anti-dependency targets, as they can't be chained anyway.
+      Symbol *toSym = find(to);
+      if (!toSym)
+        continue;
+      auto toUndef = dyn_cast<Undefined>(toSym);
+      if (toUndef && (!toUndef->weakAlias || toUndef->isAntiDep))
+        continue;
+      toSym->isUsedInRegularObj = true;
+      if (toSym->isLazy())
+        forceLazy(toSym);
+      u->setWeakAlias(toSym);
+    }
+  }
+}
+
 // Parses /aligncomm option argument.
 void SymbolTable::parseAligncomm(StringRef s) {
   auto [name, align] = s.split(',');
diff --git a/lld/COFF/SymbolTable.h b/lld/COFF/SymbolTable.h
index 15e2644a6f51..7eb067640dc8 100644
--- a/lld/COFF/SymbolTable.h
+++ b/lld/COFF/SymbolTable.h
@@ -69,6 +69,9 @@ class SymbolTable {
   // symbols and warn about imported local symbols.
   void resolveRemainingUndefines();
 
+  // Try to resolve undefined symbols with alternate names.
+  void resolveAlternateNames();
+
   // Load lazy objects that are needed for MinGW automatic import and for
   // doing stdcall fixups.
   void loadMinGWSymbols();
diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp
index 479131a24dcf..9b33e78731c9 100644
--- a/lld/ELF/Arch/Hexagon.cpp
+++ b/lld/ELF/Arch/Hexagon.cpp
@@ -11,6 +11,7 @@
 #include "Symbols.h"
 #include "SyntheticSections.h"
 #include "Target.h"
+#include "Thunks.h"
 #include "lld/Common/ErrorHandler.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -36,6 +37,10 @@ class Hexagon final : public TargetInfo {
                      const uint8_t *loc) const override;
   RelType getDynRel(RelType type) const override;
   int64_t getImplicitAddend(const uint8_t *buf, RelType type) const override;
+  bool needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                  uint64_t branchAddr, const Symbol &s,
+                  int64_t a) const override;
+  bool inBranchRange(RelType type, uint64_t src, uint64_t dst) const override;
   void relocate(uint8_t *loc, const Relocation &rel,
                 uint64_t val) const override;
   void writePltHeader(uint8_t *buf) const override;
@@ -63,6 +68,8 @@ Hexagon::Hexagon(Ctx &ctx) : TargetInfo(ctx) {
   tlsGotRel = R_HEX_TPREL_32;
   tlsModuleIndexRel = R_HEX_DTPMOD_32;
   tlsOffsetRel = R_HEX_DTPREL_32;
+
+  needsThunks = true;
 }
 
 uint32_t Hexagon::calcEFlags() const {
@@ -258,6 +265,46 @@ static uint32_t findMaskR16(Ctx &ctx, uint32_t insn) {
 
 static void or32le(uint8_t *p, int32_t v) { write32le(p, read32le(p) | v); }
 
+bool Hexagon::inBranchRange(RelType type, uint64_t src, uint64_t dst) const {
+  int64_t offset = dst - src;
+  switch (type) {
+  case llvm::ELF::R_HEX_B22_PCREL:
+  case llvm::ELF::R_HEX_PLT_B22_PCREL:
+  case llvm::ELF::R_HEX_GD_PLT_B22_PCREL:
+  case llvm::ELF::R_HEX_LD_PLT_B22_PCREL:
+    return llvm::isInt<22>(offset >> 2);
+  case llvm::ELF::R_HEX_B15_PCREL:
+    return llvm::isInt<15>(offset >> 2);
+    break;
+  case llvm::ELF::R_HEX_B13_PCREL:
+    return llvm::isInt<13>(offset >> 2);
+    break;
+  case llvm::ELF::R_HEX_B9_PCREL:
+    return llvm::isInt<9>(offset >> 2);
+  default:
+    return true;
+  }
+  llvm_unreachable("unsupported relocation");
+}
+
+bool Hexagon::needsThunk(RelExpr expr, RelType type, const InputFile *file,
+                         uint64_t branchAddr, const Symbol &s,
+                         int64_t a) const {
+  // Only check branch range for supported branch relocation types
+  switch (type) {
+  case R_HEX_B22_PCREL:
+  case R_HEX_PLT_B22_PCREL:
+  case R_HEX_GD_PLT_B22_PCREL:
+  case R_HEX_LD_PLT_B22_PCREL:
+  case R_HEX_B15_PCREL:
+  case R_HEX_B13_PCREL:
+  case R_HEX_B9_PCREL:
+    return !ctx.target->inBranchRange(type, branchAddr, s.getVA(ctx, a));
+  default:
+    return false;
+  }
+}
+
 void Hexagon::relocate(uint8_t *loc, const Relocation &rel,
                        uint64_t val) const {
   switch (rel.type) {
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index fe804cbb0e69..8802c8c2e7f0 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -46,6 +46,8 @@ class LoongArch final : public TargetInfo {
 private:
   void tlsdescToIe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
   void tlsdescToLe(uint8_t *loc, const Relocation &rel, uint64_t val) const;
+  bool tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+                     const Relocation &rLo12, uint64_t secAddr) const;
 };
 } // end anonymous namespace
 
@@ -809,10 +811,13 @@ static void relaxPCHi20Lo12(Ctx &ctx, const InputSection &sec, size_t i,
   // address.
   // Meanwhile skip undefined, preemptible and STT_GNU_IFUNC symbols, because
   // these symbols may be resolve in runtime.
+  // Moreover, relaxation can only occur if the addends of both relocations are
+  // zero for GOT references.
   if (rHi20.type == R_LARCH_GOT_PC_HI20 &&
-      (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
-       rHi20.sym->isGnuIFunc() ||
-       (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section)))
+      (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+       rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc() ||
+       (ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section) ||
+       rHi20.addend != 0 || rLo12.addend != 0))
     return;
 
   uint64_t dest = 0;
@@ -966,10 +971,16 @@ static bool relax(Ctx &ctx, InputSection &sec) {
     case R_LARCH_GOT_PC_HI20:
     case R_LARCH_TLS_GD_PC_HI20:
     case R_LARCH_TLS_LD_PC_HI20:
-    case R_LARCH_TLS_DESC_PC_HI20:
       // The overflow check for i+2 will be carried out in isPairRelaxable.
-      if (r.expr != RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC &&
-          r.expr != R_RELAX_TLS_GD_TO_LE && isPairRelaxable(relocs, i))
+      if (isPairRelaxable(relocs, i))
+        relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
+      break;
+    case R_LARCH_TLS_DESC_PC_HI20:
+      if (r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
+          r.expr == R_RELAX_TLS_GD_TO_LE) {
+        if (relaxable(relocs, i))
+          remove = 4;
+      } else if (isPairRelaxable(relocs, i))
         relaxPCHi20Lo12(ctx, sec, i, loc, r, relocs[i + 2], remove);
       break;
     case R_LARCH_CALL36:
@@ -987,6 +998,17 @@ static bool relax(Ctx &ctx, InputSection &sec) {
           isUInt<12>(r.sym->getVA(ctx, r.addend)))
         remove = 4;
       break;
+    case R_LARCH_TLS_DESC_PC_LO12:
+      if (relaxable(relocs, i) &&
+          (r.expr == RE_LOONGARCH_RELAX_TLS_GD_TO_IE_PAGE_PC ||
+           r.expr == R_RELAX_TLS_GD_TO_LE))
+        remove = 4;
+      break;
+    case R_LARCH_TLS_DESC_LD:
+      if (relaxable(relocs, i) && r.expr == R_RELAX_TLS_GD_TO_LE &&
+          isUInt<12>(r.sym->getVA(ctx, r.addend)))
+        remove = 4;
+      break;
     }
 
     // For all anchors whose offsets are <= r.offset, they are preceded by
@@ -1135,6 +1157,78 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
   }
 }
 
+// Try GOT indirection to PC relative optimization.
+// From:
+//  * pcalau12i $a0, %got_pc_hi20(sym_got)
+//  * ld.w/d    $a0, $a0, %got_pc_lo12(sym_got)
+// To:
+//  * pcalau12i $a0, %pc_hi20(sym)
+//  * addi.w/d  $a0, $a0, %pc_lo12(sym)
+//
+// Note: Althouth the optimization has been performed, the GOT entries still
+// exists, similarly to AArch64. Eliminating the entries will increase code
+// complexity.
+bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
+                              const Relocation &rLo12, uint64_t secAddr) const {
+  // Check if the relocations apply to consecutive instructions.
+  if (rHi20.offset + 4 != rLo12.offset)
+    return false;
+
+  // Check if the relocations reference the same symbol and skip undefined,
+  // preemptible and STT_GNU_IFUNC symbols.
+  if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
+      rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
+    return false;
+
+  // GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
+  // in position-independent code because these instructions produce a relative
+  // address.
+  if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
+    return false;
+
+  // Check if the addends of the both relocations are zero.
+  if (rHi20.addend != 0 || rLo12.addend != 0)
+    return false;
+
+  const uint32_t currInsn = read32le(loc);
+  const uint32_t nextInsn = read32le(loc + 4);
+  const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
+  // Check if the first instruction is PCALAU12I and the second instruction is
+  // LD.
+  if ((currInsn & 0xfe000000) != PCALAU12I ||
+      (nextInsn & 0xffc00000) != ldOpcode)
+    return false;
+
+  // Check if use the same register.
+  if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
+    return false;
+
+  Symbol &sym = *rHi20.sym;
+  uint64_t symLocal = sym.getVA(ctx);
+  const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
+  // Check if the symbol address is in
+  // [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
+  const int64_t underflow = -0x80000000LL - 0x800;
+  const int64_t overflow = 0x80000000LL - 0x800;
+  if (!(displace >= underflow && displace < overflow))
+    return false;
+
+  Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
+                         rHi20.addend, &sym};
+  Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
+                         &sym};
+  uint64_t pageDelta =
+      getLoongArchPageDelta(symLocal, secAddr + rHi20.offset, rHi20.type);
+  // pcalau12i $a0, %pc_hi20
+  write32le(loc, insn(PCALAU12I, getD5(currInsn), 0, 0));
+  relocate(loc, newRHi20, pageDelta);
+  // addi.w/d $a0, $a0, %pc_lo12
+  write32le(loc + 4, insn(ctx.arg.is64 ? ADDI_D : ADDI_W, getD5(nextInsn),
+                          getJ5(nextInsn), 0));
+  relocate(loc + 4, newRLo12, SignExtend64(symLocal, 64));
+  return true;
+}
+
 // During TLSDESC GD_TO_IE, the converted code sequence always includes an
 // instruction related to the Lo12 relocation (ld.[wd]). To obtain correct val
 // in `getRelocTargetVA`, expr of this instruction should be adjusted to
@@ -1152,6 +1246,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
   return expr;
 }
 
+static bool pairForGotRels(ArrayRef<Relocation> relocs) {
+  // Check if R_LARCH_GOT_PC_HI20 and R_LARCH_GOT_PC_LO12 always appear in
+  // pairs.
+  size_t i = 0;
+  const size_t size = relocs.size();
+  for (; i != size; ++i) {
+    if (relocs[i].type == R_LARCH_GOT_PC_HI20) {
+      if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) {
+        ++i;
+        continue;
+      }
+      if (relaxable(relocs, i) && i + 2 < size &&
+          relocs[i + 2].type == R_LARCH_GOT_PC_LO12) {
+        i += 2;
+        continue;
+      }
+      break;
+    } else if (relocs[i].type == R_LARCH_GOT_PC_LO12) {
+      break;
+    }
+  }
+  return i == size;
+}
+
 void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
   const unsigned bits = ctx.arg.is64 ? 64 : 32;
   uint64_t secAddr = sec.getOutputSection()->addr;
@@ -1161,6 +1279,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
     secAddr += ehIn->getParent()->outSecOff;
   bool isExtreme = false, isRelax = false;
   const MutableArrayRef<Relocation> relocs = sec.relocs();
+  const bool isPairForGotRels = pairForGotRels(relocs);
   for (size_t i = 0, size = relocs.size(); i != size; ++i) {
     Relocation &rel = relocs[i];
     uint8_t *loc = buf + rel.offset;
@@ -1216,6 +1335,10 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
                            bits);
         relocateNoSym(loc, rel.type, val);
       } else {
+        isRelax = relaxable(relocs, i);
+        if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
+                        rel.type == R_LARCH_TLS_DESC_PC_LO12))
+          continue;
         tlsdescToIe(loc, rel, val);
       }
       continue;
@@ -1232,9 +1355,32 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
                            bits);
         relocateNoSym(loc, rel.type, val);
       } else {
+        isRelax = relaxable(relocs, i);
+        if (isRelax && (rel.type == R_LARCH_TLS_DESC_PC_HI20 ||
+                        rel.type == R_LARCH_TLS_DESC_PC_LO12 ||
+                        (rel.type == R_LARCH_TLS_DESC_LD && isUInt<12>(val))))
+          continue;
         tlsdescToLe(loc, rel, val);
       }
       continue;
+    case RE_LOONGARCH_GOT_PAGE_PC:
+      // In LoongArch, we try GOT indirection to PC relative optimization in
+      // normal or medium code model, whether or not with R_LARCH_RELAX
+      // relocation. Moreover, if the original code sequence can be relaxed to a
+      // single instruction `pcaddi`, the first instruction will be removed and
+      // it will not reach here.
+      if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) {
+        bool isRelax = relaxable(relocs, i);
+        const Relocation lo12Rel = isRelax ? relocs[i + 2] : relocs[i + 1];
+        if (lo12Rel.type == R_LARCH_GOT_PC_LO12 &&
+            tryGotToPCRel(loc, rel, lo12Rel, secAddr)) {
+          // isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12
+          // !isRelax: skip relocation R_LARCH_GOT_PC_LO12
+          i += isRelax ? 2 : 1;
+          continue;
+        }
+      }
+      break;
     default:
       break;
     }
diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp
index 71e72e7184b9..62b54f8217bc 100644
--- a/lld/ELF/InputFiles.cpp
+++ b/lld/ELF/InputFiles.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/LTO/LTO.h"
+#include "llvm/Object/Archive.h"
 #include "llvm/Object/IRObjectFile.h"
 #include "llvm/Support/ARMAttributeParser.h"
 #include "llvm/Support/ARMBuildAttributes.h"
@@ -1753,6 +1754,39 @@ static uint8_t getOsAbi(const Triple &t) {
   }
 }
 
+// For DTLTO, bitcode member names must be valid paths to files on disk.
+// For thin archives, resolve `memberPath` relative to the archive's location.
+// Returns true if adjusted; false otherwise. Non-thin archives are unsupported.
+static bool dtltoAdjustMemberPathIfThinArchive(Ctx &ctx, StringRef archivePath,
+                                               std::string &memberPath) {
+  assert(!archivePath.empty());
+
+  if (ctx.arg.dtltoDistributor.empty())
+    return false;
+
+  // Read the archive header to determine if it's a thin archive.
+  auto bufferOrErr =
+      MemoryBuffer::getFileSlice(archivePath, sizeof(ThinArchiveMagic) - 1, 0);
+  if (std::error_code ec = bufferOrErr.getError()) {
+    ErrAlways(ctx) << "cannot open " << archivePath << ": " << ec.message();
+    return false;
+  }
+
+  if (!bufferOrErr->get()->getBuffer().starts_with(ThinArchiveMagic))
+    return false;
+
+  SmallString<128> resolvedPath;
+  if (path::is_relative(memberPath)) {
+    resolvedPath = path::parent_path(archivePath);
+    path::append(resolvedPath, memberPath);
+  } else
+    resolvedPath = memberPath;
+
+  path::remove_dots(resolvedPath, /*remove_dot_dot=*/true);
+  memberPath = resolvedPath.str();
+  return true;
+}
+
 BitcodeFile::BitcodeFile(Ctx &ctx, MemoryBufferRef mb, StringRef archiveName,
                          uint64_t offsetInArchive, bool lazy)
     : InputFile(ctx, BitcodeKind, mb) {
@@ -1763,17 +1797,22 @@ BitcodeFile::BitcodeFile(Ctx &ctx, MemoryBufferRef mb, StringRef archiveName,
   if (ctx.arg.thinLTOIndexOnly)
     path = replaceThinLTOSuffix(ctx, mb.getBufferIdentifier());
 
-  // ThinLTO assumes that all MemoryBufferRefs given to it have a unique
-  // name. If two archives define two members with the same name, this
-  // causes a collision which result in only one of the objects being taken
-  // into consideration at LTO time (which very likely causes undefined
-  // symbols later in the link stage). So we append file offset to make
-  // filename unique.
   StringSaver &ss = ctx.saver;
-  StringRef name = archiveName.empty()
-                       ? ss.save(path)
-                       : ss.save(archiveName + "(" + path::filename(path) +
-                                 " at " + utostr(offsetInArchive) + ")");
+  StringRef name;
+  if (archiveName.empty() ||
+      dtltoAdjustMemberPathIfThinArchive(ctx, archiveName, path)) {
+    name = ss.save(path);
+  } else {
+    // ThinLTO assumes that all MemoryBufferRefs given to it have a unique
+    // name. If two archives define two members with the same name, this
+    // causes a collision which result in only one of the objects being taken
+    // into consideration at LTO time (which very likely causes undefined
+    // symbols later in the link stage). So we append file offset to make
+    // filename unique.
+    name = ss.save(archiveName + "(" + path::filename(path) + " at " +
+                   utostr(offsetInArchive) + ")");
+  }
+
   MemoryBufferRef mbref(mb.getBuffer(), name);
 
   obj = CHECK2(lto::InputFile::create(mbref), this);
diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index cebd564036b2..608cdd0d2666 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -2139,19 +2139,45 @@ void ThunkCreator::mergeThunks(ArrayRef<OutputSection *> outputSections) {
       });
 }
 
-static int64_t getPCBias(Ctx &ctx, RelType type) {
-  if (ctx.arg.emachine != EM_ARM)
-    return 0;
-  switch (type) {
-  case R_ARM_THM_JUMP19:
-  case R_ARM_THM_JUMP24:
-  case R_ARM_THM_CALL:
-    return 4;
-  default:
-    return 8;
+constexpr uint32_t HEXAGON_MASK_END_PACKET = 3 << 14;
+constexpr uint32_t HEXAGON_END_OF_PACKET = 3 << 14;
+constexpr uint32_t HEXAGON_END_OF_DUPLEX = 0 << 14;
+
+// Return the distance between the packet start and the instruction in the
+// relocation.
+static int getHexagonPacketOffset(const InputSection &isec,
+                                  const Relocation &rel) {
+  const ArrayRef<uint8_t> data = isec.content();
+
+  // Search back as many as 3 instructions.
+  for (unsigned i = 0;; i++) {
+    if (i == 3 || rel.offset < (i + 1) * 4)
+      return i * 4;
+    uint32_t instWord =
+        read32(isec.getCtx(), data.data() + (rel.offset - (i + 1) * 4));
+    if (((instWord & HEXAGON_MASK_END_PACKET) == HEXAGON_END_OF_PACKET) ||
+        ((instWord & HEXAGON_MASK_END_PACKET) == HEXAGON_END_OF_DUPLEX))
+      return i * 4;
   }
 }
 
+static int64_t getPCBias(Ctx &ctx, const InputSection &isec,
+                         const Relocation &rel) {
+  if (ctx.arg.emachine == EM_ARM) {
+    switch (rel.type) {
+    case R_ARM_THM_JUMP19:
+    case R_ARM_THM_JUMP24:
+    case R_ARM_THM_CALL:
+      return 4;
+    default:
+      return 8;
+    }
+  }
+  if (ctx.arg.emachine == EM_HEXAGON)
+    return -getHexagonPacketOffset(isec, rel);
+  return 0;
+}
+
 // Find or create a ThunkSection within the InputSectionDescription (ISD) that
 // is in range of Src. An ISD maps to a range of InputSections described by a
 // linker script section pattern such as { .text .text.* }.
@@ -2161,7 +2187,7 @@ ThunkSection *ThunkCreator::getISDThunkSec(OutputSection *os,
                                            const Relocation &rel,
                                            uint64_t src) {
   // See the comment in getThunk for -pcBias below.
-  const int64_t pcBias = getPCBias(ctx, rel.type);
+  const int64_t pcBias = getPCBias(ctx, *isec, rel);
   for (std::pair<ThunkSection *, uint32_t> tp : isd->thunkSections) {
     ThunkSection *ts = tp.first;
     uint64_t tsBase = os->addr + ts->outSecOff - pcBias;
@@ -2322,7 +2348,7 @@ std::pair<Thunk *, bool> ThunkCreator::getThunk(InputSection *isec,
   // out in the relocation addend. We compensate for the PC bias so that
   // an Arm and Thumb relocation to the same destination get the same keyAddend,
   // which is usually 0.
-  const int64_t pcBias = getPCBias(ctx, rel.type);
+  const int64_t pcBias = getPCBias(ctx, *isec, rel);
   const int64_t keyAddend = rel.addend + pcBias;
 
   // We use a ((section, offset), addend) pair to find the thunk position if
@@ -2481,7 +2507,7 @@ bool ThunkCreator::createThunks(uint32_t pass,
             // STT_SECTION + non-zero addend, clear the addend after
             // redirection.
             if (ctx.arg.emachine != EM_MIPS)
-              rel.addend = -getPCBias(ctx, rel.type);
+              rel.addend = -getPCBias(ctx, *isec, rel);
           }
 
         for (auto &p : isd->thunkSections)
@@ -2525,7 +2551,8 @@ void elf::hexagonTLSSymbolUpdate(Ctx &ctx) {
           for (Relocation &rel : isec->relocs())
             if (rel.sym->type == llvm::ELF::STT_TLS && rel.expr == R_PLT_PC) {
               if (needEntry) {
-                sym->allocateAux(ctx);
+                if (sym->auxIdx == 0)
+                  sym->allocateAux(ctx);
                 addPltEntry(ctx, *ctx.in.plt, *ctx.in.gotPlt, *ctx.in.relaPlt,
                             ctx.target->pltRel, *sym);
                 needEntry = false;
diff --git a/lld/ELF/Thunks.cpp b/lld/ELF/Thunks.cpp
index c26ba76bccb7..65d0f094c43c 100644
--- a/lld/ELF/Thunks.cpp
+++ b/lld/ELF/Thunks.cpp
@@ -415,6 +415,22 @@ class AVRThunk : public Thunk {
   void addSymbols(ThunkSection &isec) override;
 };
 
+// Hexagon CPUs need thunks for R_HEX_B{9,1{3,5},22}_PCREL,
+// R_HEX_{,GD_}PLT_B22_PCREL when their destination is out of
+// range.
+class HexagonThunk : public Thunk {
+public:
+  HexagonThunk(Ctx &ctx, const InputSection &isec, Relocation &rel,
+               Symbol &dest)
+      : Thunk(ctx, dest, 0), relOffset(rel.offset) {
+    alignment = 4;
+  }
+  uint32_t relOffset;
+  uint32_t size() override { return ctx.arg.isPic ? 12 : 8; }
+  void writeTo(uint8_t *buf) override;
+  void addSymbols(ThunkSection &isec) override;
+};
+
 // MIPS LA25 thunk
 class MipsThunk final : public Thunk {
 public:
@@ -1519,6 +1535,39 @@ bool PPC64LongBranchThunk::isCompatibleWith(const InputSection &isec,
   return rel.type == R_PPC64_REL24 || rel.type == R_PPC64_REL14;
 }
 
+// Hexagon Target Thunks
+static uint64_t getHexagonThunkDestVA(Ctx &ctx, const Symbol &s, int64_t a) {
+  uint64_t v = s.isInPlt(ctx) ? s.getPltVA(ctx) : s.getVA(ctx, a);
+  return SignExtend64<32>(v);
+}
+
+void HexagonThunk::writeTo(uint8_t *buf) {
+  uint64_t s = getHexagonThunkDestVA(ctx, destination, addend);
+  uint64_t p = getThunkTargetSym()->getVA(ctx);
+
+  if (ctx.arg.isPic) {
+    write32(ctx, buf + 0, 0x00004000); // {  immext(#0)
+    ctx.target->relocateNoSym(buf, R_HEX_B32_PCREL_X, s - p);
+    write32(ctx, buf + 4, 0x6a49c00e); //    r14 = add(pc,##0) }
+    ctx.target->relocateNoSym(buf + 4, R_HEX_6_PCREL_X, s - p);
+
+    write32(ctx, buf + 8, 0x528ec000); // {  jumpr r14 }
+  } else {
+    write32(ctx, buf + 0, 0x00004000); //  { immext
+    ctx.target->relocateNoSym(buf, R_HEX_B32_PCREL_X, s - p);
+    write32(ctx, buf + 4, 0x5800c000); //    jump <> }
+    ctx.target->relocateNoSym(buf + 4, R_HEX_B22_PCREL_X, s - p);
+  }
+}
+void HexagonThunk::addSymbols(ThunkSection &isec) {
+  Symbol *enclosing = isec.getEnclosingSymbol(relOffset);
+  StringRef src = enclosing ? enclosing->getName() : isec.name;
+
+  addSymbol(
+      saver().save("__hexagon_thunk_" + destination.getName() + "_from_" + src),
+      STT_FUNC, 0, isec);
+}
+
 Thunk::Thunk(Ctx &ctx, Symbol &d, int64_t a)
     : ctx(ctx), destination(d), addend(a), offset(0) {
   destination.thunkAccessed = true;
@@ -1692,6 +1741,24 @@ static std::unique_ptr<Thunk> addThunkAVR(Ctx &ctx, RelType type, Symbol &s,
   }
 }
 
+static std::unique_ptr<Thunk> addThunkHexagon(Ctx &ctx,
+                                              const InputSection &isec,
+                                              Relocation &rel, Symbol &s) {
+  switch (rel.type) {
+  case R_HEX_B9_PCREL:
+  case R_HEX_B13_PCREL:
+  case R_HEX_B15_PCREL:
+  case R_HEX_B22_PCREL:
+  case R_HEX_PLT_B22_PCREL:
+  case R_HEX_GD_PLT_B22_PCREL:
+    return std::make_unique<HexagonThunk>(ctx, isec, rel, s);
+  default:
+    Fatal(ctx) << "unrecognized relocation " << rel.type << " to " << &s
+               << " for hexagon target";
+    llvm_unreachable("");
+  }
+}
+
 static std::unique_ptr<Thunk> addThunkMips(Ctx &ctx, RelType type, Symbol &s) {
   if ((s.stOther & STO_MIPS_MICROMIPS) && isMipsR6(ctx))
     return std::make_unique<MicroMipsR6Thunk>(ctx, s);
@@ -1761,8 +1828,11 @@ std::unique_ptr<Thunk> elf::addThunk(Ctx &ctx, const InputSection &isec,
     return addThunkPPC32(ctx, isec, rel, s);
   case EM_PPC64:
     return addThunkPPC64(ctx, rel.type, s, a);
+  case EM_HEXAGON:
+    return addThunkHexagon(ctx, isec, rel, s);
   default:
-    llvm_unreachable("add Thunk only supported for ARM, AVR, Mips and PowerPC");
+    llvm_unreachable(
+        "add Thunk only supported for ARM, AVR, Hexagon, Mips and PowerPC");
   }
 }
 
diff --git a/lld/docs/DTLTO.rst b/lld/docs/DTLTO.rst
index 985decf6c7db..54fcc034d137 100644
--- a/lld/docs/DTLTO.rst
+++ b/lld/docs/DTLTO.rst
@@ -7,8 +7,7 @@ during the traditional link step.
 
 The implementation is documented here: https://llvm.org/docs/DTLTO.html.
 
-Currently, DTLTO is only supported in ELF LLD. Support will be added to other
-LLD flavours in the future.
+Currently, DTLTO is only supported in ELF and COFF LLD.
 
 ELF LLD
 -------
@@ -40,3 +39,37 @@ The command-line interface is as follows:
 Some LLD LTO options (e.g., ``--lto-sample-profile=<file>``) are supported.
 Currently, other options are silently accepted but do not have the intended
 effect. Support for such options will be expanded in the future.
+
+COFF LLD
+--------
+
+The command-line interface is as follows:
+
+- ``/thinlto-distributor:<path>``
+  Specifies the file to execute as the distributor process. If specified,
+  ThinLTO backend compilations will be distributed.
+
+- ``/thinlto-remote-compiler:<path>``
+  Specifies the path to the compiler that the distributor process will use for
+  backend compilations. The compiler invoked must match the version of LLD.
+
+- ``/thinlto-distributor-arg:<arg>``
+  Specifies ``<arg>`` on the command line when invoking the distributor.
+  Can be specified multiple times.
+
+- ``/thinlto-remote-compiler-arg:<arg>``
+  Appends ``<arg>`` to the remote compiler's command line.
+  Can be specified multiple times.
+
+  Options that introduce extra input/output files may cause miscompilation if
+  the distribution system does not automatically handle pushing/fetching them to
+  remote nodes. In such cases, configure the distributor - possibly using
+  ``/thinlto-distributor-arg:`` - to manage these dependencies. See the
+  distributor documentation for details.
+
+Some LLD LTO options (e.g., ``/lto-sample-profile:<file>``) are supported.
+Currently, other options are silently accepted but do not have the intended
+effect. Support for such options could be expanded in the future.
+
+Currently, there is no DTLTO command line interface supplied for ``clang-cl``,
+as users are expected to invoke LLD directly.
\ No newline at end of file
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 863b20189100..03671c6f3b6f 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -92,6 +92,9 @@ Breaking changes
 
 COFF Improvements
 -----------------
+* ``/thinlto-distributor`` and ``/thinlto-remote-compiler`` options
+  added to support Integrated Distributed ThinLTO.
+  (`#147265 <https://github.com/llvm/llvm-project/pull/147265>`_)
 
 MinGW Improvements
 ------------------
diff --git a/lld/test/COFF/alternatename-alias.s b/lld/test/COFF/alternatename-alias.s
new file mode 100644
index 000000000000..bd0a861380e9
--- /dev/null
+++ b/lld/test/COFF/alternatename-alias.s
@@ -0,0 +1,15 @@
+// REQUIRES: x86
+
+// Check that a weak alias can be used as an alternate name target.
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: lld-link -dll -noentry %t.obj -alternatename:sym=altsym
+
+        .data
+        .rva sym
+
+        .weak altsym
+        .set altsym,a
+
+        .globl a
+a:
+        .word 1
diff --git a/lld/test/COFF/alternatename-antidep.s b/lld/test/COFF/alternatename-antidep.s
new file mode 100644
index 000000000000..1188a9b75d48
--- /dev/null
+++ b/lld/test/COFF/alternatename-antidep.s
@@ -0,0 +1,16 @@
+// REQUIRES: x86
+
+// Check that an anti-dependency alias can't be used as an alternate name target.
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: not lld-link -dll -noentry %t.obj -alternatename:sym=altsym 2>&1 | FileCheck %s
+// CHECK: error: undefined symbol: sym
+
+        .data
+        .rva sym
+
+        .weak_anti_dep altsym
+        .set altsym,a
+
+        .globl a
+a:
+        .word 1
diff --git a/lld/test/COFF/alternatename-lib.s b/lld/test/COFF/alternatename-lib.s
new file mode 100644
index 000000000000..206fe6bc2397
--- /dev/null
+++ b/lld/test/COFF/alternatename-lib.s
@@ -0,0 +1,43 @@
+// REQUIRES: x86
+// RUN: split-file %s %t.dir && cd %t.dir
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows refab.s -o refab.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows aa.s -o aa.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows b.s -o b.obj
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows antidep.s -o antidep.obj
+// RUN: llvm-lib -out:aa.lib aa.obj
+// RUN: llvm-lib -out:b.lib b.obj
+
+// Check that -alternatename with an undefined target does not prevent the symbol from being resolved to a library,
+// once another alternate name is resolved and pulls in the source symbol.
+// RUN: lld-link -out:out.dll -dll -noentry -machine:amd64 refab.obj aa.lib -alternatename:a=aa -alternatename:b=undef
+
+// Check that -alternatename with an anti-dependency target does not prevent the symbol from being resolved to a library,
+// after another alternate name is resolved and pulls in the source symbol.
+// RUN: lld-link -out:out2.dll -dll -noentry -machine:amd64 antidep.obj refab.obj aa.lib -alternatename:a=aa -alternatename:b=u
+
+#--- refab.s
+        .data
+        .rva a
+        .rva b
+
+#--- aa.s
+        .globl aa
+aa:
+        .word 1
+
+        .section .drectve, "yn"
+        .ascii "/defaultlib:b.lib"
+
+#--- b.s
+        .globl b
+b:
+        .word 2
+
+#--- antidep.s
+        .weak_anti_dep u
+        .set u,d
+
+        .globl d
+d:
+        .word 3
diff --git a/lld/test/COFF/alternatename-lto.ll b/lld/test/COFF/alternatename-lto.ll
new file mode 100644
index 000000000000..c3666cd3501d
--- /dev/null
+++ b/lld/test/COFF/alternatename-lto.ll
@@ -0,0 +1,25 @@
+; REQUIRES: x86
+; RUN: mkdir -p %t.dir
+; RUN: llvm-as -o %t.obj %s
+; RUN: lld-link -out:%t.dll -dll -noentry %t.obj -export:test
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-windows-msvc19.33.0"
+
+$alt = comdat any
+
+@alt = weak_odr dso_local global i32 0, comdat, align 4
+@ext = external dso_local global i32, align 4
+
+; Function Attrs: noinline nounwind optnone uwtable
+define dso_local i32 @test() #0 {
+entry:
+  %0 = load i32, ptr @ext, align 4
+  ret i32 %0
+}
+
+attributes #0 = { noinline nounwind optnone uwtable }
+
+!llvm.linker.options = !{!0}
+
+!0 = !{!"/alternatename:ext=alt"}
diff --git a/lld/test/COFF/arm64ec-altnames.s b/lld/test/COFF/arm64ec-altnames.s
index b2abb24efe4c..cca778ab8dc6 100644
--- a/lld/test/COFF/arm64ec-altnames.s
+++ b/lld/test/COFF/arm64ec-altnames.s
@@ -2,6 +2,7 @@ REQUIRES: aarch64
 RUN: split-file %s %t.dir && cd %t.dir
 
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ext.s -o ext.obj
+RUN: llvm-mc -filetype=obj -triple=arm64ec-windows ext-mangled.s -o ext-mangled.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows impl.s -o impl.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows impl-cpp.s -o impl-cpp.obj
 RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig.obj
@@ -49,6 +50,20 @@ RUN: lld-link -machine:arm64ec -dll -noentry -out:out4.dll impl-cpp.obj loadconf
 RUN: llvm-objdump -d out4.dll | FileCheck --check-prefix=DISASM %s
 RUN: llvm-readobj --hex-dump=.test out4.dll | FileCheck --check-prefix=TESTSEC %s
 
+# Check that when both mangled and demangled alternate names are used,
+# only the one whose target is defined is used (the mangled one in this case).
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out5.dll ext-mangled.obj loadconfig.obj "-alternatename:#func=#altsym" -alternatename:func=altsym
+RUN: llvm-objdump -d out5.dll | FileCheck --check-prefix=DISASM %s
+RUN: llvm-readobj --hex-dump=.test out5.dll | FileCheck --check-prefix=TESTSEC %s
+
+# Check that when both mangled and demangled alternate names are used,
+# only the one whose target is defined is used (the demangled one in this case).
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out6.dll ext.obj loadconfig.obj "-alternatename:#func=#altsym" -alternatename:func=altsym
+RUN: llvm-objdump -d out6.dll | FileCheck --check-prefix=DISASM2 %s
+RUN: llvm-readobj --hex-dump=.test out6.dll | FileCheck --check-prefix=TESTSEC2 %s
+
 #--- ext.s
         .weak_anti_dep func
 .set func, "#func"
@@ -70,6 +85,30 @@ altsym:
         mov w0, #1
         ret
 
+#--- ext-mangled.s
+        .weak_anti_dep func
+.set func, "#func"
+        .weak_anti_dep "#func"
+.set "#func", thunksym
+
+        .section .test, "r"
+        .rva func
+        .rva "#func"
+
+        .section .thnk,"xr",discard,thunksym
+thunksym:
+        mov w0, #2
+        ret
+
+        .section .text,"xr",discard,"#altsym"
+        .globl "#altsym"
+"#altsym":
+        mov w0, #1
+        ret
+
+        .weak_anti_dep altsym
+        .set altsym,"#altsym"
+
 #--- impl.s
         .weak_anti_dep func
 .set func, "#func"
diff --git a/lld/test/COFF/arm64ec-delayimport.test b/lld/test/COFF/arm64ec-delayimport.test
index 1e0bd899ba32..01d4ab89982e 100644
--- a/lld/test/COFF/arm64ec-delayimport.test
+++ b/lld/test/COFF/arm64ec-delayimport.test
@@ -51,28 +51,28 @@ DISASM-NEXT: 180002016: 48 8d 05 6b 50 00 00         leaq    0x506b(%rip), %rax
 DISASM-NEXT: 18000201d: e9 0c 00 00 00               jmp     0x18000202e <.text+0x102e>
 DISASM-NEXT: 180002022: 48 8d 05 67 50 00 00         leaq    0x5067(%rip), %rax      # 0x180007090
 DISASM-NEXT: 180002029: e9 00 00 00 00               jmp     0x18000202e <.text+0x102e>
-DISASM-NEXT: 18000202e: 51                           pushq   %rcx
-DISASM-NEXT: 18000202f: 52                           pushq   %rdx
-DISASM-NEXT: 180002030: 41 50                        pushq   %r8
-DISASM-NEXT: 180002032: 41 51                        pushq   %r9
-DISASM-NEXT: 180002034: 48 83 ec 48                  subq    $0x48, %rsp
-DISASM-NEXT: 180002038: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-DISASM-NEXT: 18000203d: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-DISASM-NEXT: 180002043: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-DISASM-NEXT: 180002049: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-DISASM-NEXT: 18000204f: 48 8b d0                     movq    %rax, %rdx
-DISASM-NEXT: 180002052: 48 8d 0d a7 21 00 00         leaq    0x21a7(%rip), %rcx      # 0x180004200
-DISASM-NEXT: 180002059: e8 aa ef ff ff               callq   0x180001008 <.text+0x8>
-DISASM-NEXT: 18000205e: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-DISASM-NEXT: 180002063: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-DISASM-NEXT: 180002069: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-DISASM-NEXT: 18000206f: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-DISASM-NEXT: 180002075: 48 83 c4 48                  addq    $0x48, %rsp
-DISASM-NEXT: 180002079: 41 59                        popq    %r9
-DISASM-NEXT: 18000207b: 41 58                        popq    %r8
-DISASM-NEXT: 18000207d: 5a                           popq    %rdx
-DISASM-NEXT: 18000207e: 59                           popq    %rcx
-DISASM-NEXT: 18000207f: ff e0                        jmpq    *%rax
+DISASM-NEXT: 18000202e: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+DISASM-NEXT: 180002033: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+DISASM-NEXT: 180002038: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+DISASM-NEXT: 18000203d: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+DISASM-NEXT: 180002042: 48 83 ec 68                  subq    $0x68, %rsp
+DISASM-NEXT: 180002046: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+DISASM-NEXT: 18000204c: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+DISASM-NEXT: 180002052: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+DISASM-NEXT: 180002058: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+DISASM-NEXT: 18000205e: 48 8b d0                     movq    %rax, %rdx
+DISASM-NEXT: 180002061: 48 8d 0d 90 21 00 00         leaq    0x2190(%rip), %rcx      # 0x1800041f8
+DISASM-NEXT: 180002068: e8 9b ef ff ff               callq   0x180001008 <.text+0x8>
+DISASM-NEXT: 18000206d: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+DISASM-NEXT: 180002073: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+DISASM-NEXT: 180002079: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+DISASM-NEXT: 18000207f: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+DISASM-NEXT: 180002085: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+DISASM-NEXT: 18000208a: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+DISASM-NEXT: 18000208f: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+DISASM-NEXT: 180002097: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+DISASM-NEXT: 18000209f: 48 83 c4 68                  addq    $0x68, %rsp
+DISASM-NEXT: 1800020a3: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s
 LOADCFG:      CHPEMetadata [
@@ -85,7 +85,7 @@ IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   Attributes: 0x1
 IMPORTS-NEXT:   ModuleHandle: 0x7080
 IMPORTS-NEXT:   ImportAddressTable: 0x7088
-IMPORTS-NEXT:   ImportNameTable: 0x4240
+IMPORTS-NEXT:   ImportNameTable: 0x4238
 IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:   Import {
@@ -141,7 +141,7 @@ RELOC-NEXT:     Address: 0x6008
 RELOC-NEXT:   }
 
 RUN: llvm-readobj --hex-dump=.pdata out.dll | FileCheck --check-prefix=PDATA %s
-PDATA: 0x180008000 2e200000 81200000 18400000
+PDATA: 0x180008000 2e200000 a5200000 18400000
 
 Verify that a demangled version of __delayLoadHelper2 can be used.
 
diff --git a/lld/test/COFF/arm64x-delayimport.test b/lld/test/COFF/arm64x-delayimport.test
index 56923ef748d0..2a68bce79baa 100644
--- a/lld/test/COFF/arm64x-delayimport.test
+++ b/lld/test/COFF/arm64x-delayimport.test
@@ -21,7 +21,7 @@ IMPORTS-NEXT:   Name: test.dll
 IMPORTS-NEXT:   Attributes: 0x1
 IMPORTS-NEXT:   ModuleHandle: 0x6080
 IMPORTS-NEXT:   ImportAddressTable: 0x6088
-IMPORTS-NEXT:   ImportNameTable: 0x4390
+IMPORTS-NEXT:   ImportNameTable: 0x4388
 IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:   Import {
@@ -35,7 +35,7 @@ IMPORTS-NEXT:     Name: test.dll
 IMPORTS-NEXT:     Attributes: 0x1
 IMPORTS-NEXT:     ModuleHandle: 0x6080
 IMPORTS-NEXT:     ImportAddressTable: 0x6098
-IMPORTS-NEXT:     ImportNameTable: 0x43A0
+IMPORTS-NEXT:     ImportNameTable: 0x4398
 IMPORTS-NEXT:     BoundDelayImportTable: 0x0
 IMPORTS-NEXT:     UnloadDelayImportTable: 0x0
 IMPORTS-NEXT:     Import {
@@ -73,7 +73,7 @@ DISASM-NEXT: 180001040: ad0497e4     stp     q4, q5, [sp, #0x90]
 DISASM-NEXT: 180001044: ad059fe6     stp     q6, q7, [sp, #0xb0]
 DISASM-NEXT: 180001048: aa1103e1     mov     x1, x17
 DISASM-NEXT: 18000104c: f0000000     adrp    x0, 0x180004000
-DISASM-NEXT: 180001050: 910d4000     add     x0, x0, #0x350
+DISASM-NEXT: 180001050: 910d2000     add     x0, x0, #0x348
 DISASM-NEXT: 180001054: 97ffffeb     bl      0x180001000 <.text>
 DISASM-NEXT: 180001058: aa0003f0     mov     x16, x0
 DISASM-NEXT: 18000105c: ad459fe6     ldp     q6, q7, [sp, #0xb0]
@@ -105,28 +105,28 @@ DISASM-NEXT:                 ...
 DISASM-NEXT: 180003000: ff 25 92 30 00 00            jmpq    *0x3092(%rip)           # 0x180006098
 DISASM-NEXT: 180003006: 48 8d 05 8b 30 00 00         leaq    0x308b(%rip), %rax      # 0x180006098
 DISASM-NEXT: 18000300d: e9 00 00 00 00               jmp     0x180003012 <.text+0x2012>
-DISASM-NEXT: 180003012: 51                           pushq   %rcx
-DISASM-NEXT: 180003013: 52                           pushq   %rdx
-DISASM-NEXT: 180003014: 41 50                        pushq   %r8
-DISASM-NEXT: 180003016: 41 51                        pushq   %r9
-DISASM-NEXT: 180003018: 48 83 ec 48                  subq    $0x48, %rsp
-DISASM-NEXT: 18000301c: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-DISASM-NEXT: 180003021: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-DISASM-NEXT: 180003027: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-DISASM-NEXT: 18000302d: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-DISASM-NEXT: 180003033: 48 8b d0                     movq    %rax, %rdx
-DISASM-NEXT: 180003036: 48 8d 0d 13 13 00 00         leaq    0x1313(%rip), %rcx # 0x180004350
-DISASM-NEXT: 18000303d: e8 c6 ef ff ff               callq   0x180002008 <.text+0x1008>
-DISASM-NEXT: 180003042: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-DISASM-NEXT: 180003047: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-DISASM-NEXT: 18000304d: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-DISASM-NEXT: 180003053: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-DISASM-NEXT: 180003059: 48 83 c4 48                  addq    $0x48, %rsp
-DISASM-NEXT: 18000305d: 41 59                        popq    %r9
-DISASM-NEXT: 18000305f: 41 58                        popq    %r8
-DISASM-NEXT: 180003061: 5a                           popq    %rdx
-DISASM-NEXT: 180003062: 59                           popq    %rcx
-DISASM-NEXT: 180003063: ff e0                        jmpq    *%rax
+DISASM-NEXT: 180003012: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+DISASM-NEXT: 180003017: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+DISASM-NEXT: 18000301c: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+DISASM-NEXT: 180003021: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+DISASM-NEXT: 180003026: 48 83 ec 68                  subq    $0x68, %rsp
+DISASM-NEXT: 18000302a: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+DISASM-NEXT: 180003030: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+DISASM-NEXT: 180003036: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+DISASM-NEXT: 18000303c: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+DISASM-NEXT: 180003042: 48 8b d0                     movq    %rax, %rdx
+DISASM-NEXT: 180003045: 48 8d 0d fc 12 00 00         leaq    0x12fc(%rip), %rcx      # 0x180004348
+DISASM-NEXT: 18000304c: e8 b7 ef ff ff               callq   0x180002008 <.text+0x1008>
+DISASM-NEXT: 180003051: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+DISASM-NEXT: 180003057: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+DISASM-NEXT: 18000305d: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+DISASM-NEXT: 180003063: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+DISASM-NEXT: 180003069: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+DISASM-NEXT: 18000306e: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+DISASM-NEXT: 180003073: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+DISASM-NEXT: 18000307b: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+DISASM-NEXT: 180003083: 48 83 c4 68                  addq    $0x68, %rsp
+DISASM-NEXT: 180003087: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out.dll | FileCheck --check-prefix=LOADCFG %s
 LOADCFG:      AuxiliaryDelayloadIAT: 0x5000
@@ -230,7 +230,7 @@ EC-IMPORTS-NEXT:   Name: test.dll
 EC-IMPORTS-NEXT:   Attributes: 0x1
 EC-IMPORTS-NEXT:   ModuleHandle: 0x6080
 EC-IMPORTS-NEXT:   ImportAddressTable: 0x6088
-EC-IMPORTS-NEXT:   ImportNameTable: 0x4388
+EC-IMPORTS-NEXT:   ImportNameTable: 0x4380
 EC-IMPORTS-NEXT:   BoundDelayImportTable: 0x0
 EC-IMPORTS-NEXT:   UnloadDelayImportTable: 0x0
 EC-IMPORTS-NEXT: }
@@ -243,7 +243,7 @@ EC-IMPORTS-NEXT:     Name: test.dll
 EC-IMPORTS-NEXT:     Attributes: 0x1
 EC-IMPORTS-NEXT:     ModuleHandle: 0x6080
 EC-IMPORTS-NEXT:     ImportAddressTable: 0x6090
-EC-IMPORTS-NEXT:     ImportNameTable: 0x4390
+EC-IMPORTS-NEXT:     ImportNameTable: 0x4388
 EC-IMPORTS-NEXT:     BoundDelayImportTable: 0x0
 EC-IMPORTS-NEXT:     UnloadDelayImportTable: 0x0
 EC-IMPORTS-NEXT:     Import {
@@ -279,28 +279,28 @@ EC-DISASM-NEXT:                 ...
 EC-DISASM-NEXT: 180003000: ff 25 8a 30 00 00            jmpq    *0x308a(%rip)           # 0x180006090
 EC-DISASM-NEXT: 180003006: 48 8d 05 83 30 00 00         leaq    0x3083(%rip), %rax      # 0x180006090
 EC-DISASM-NEXT: 18000300d: e9 00 00 00 00               jmp     0x180003012 <.text+0x2012>
-EC-DISASM-NEXT: 180003012: 51                           pushq   %rcx
-EC-DISASM-NEXT: 180003013: 52                           pushq   %rdx
-EC-DISASM-NEXT: 180003014: 41 50                        pushq   %r8
-EC-DISASM-NEXT: 180003016: 41 51                        pushq   %r9
-EC-DISASM-NEXT: 180003018: 48 83 ec 48                  subq    $0x48, %rsp
-EC-DISASM-NEXT: 18000301c: 66 0f 7f 04 24               movdqa  %xmm0, (%rsp)
-EC-DISASM-NEXT: 180003021: 66 0f 7f 4c 24 10            movdqa  %xmm1, 0x10(%rsp)
-EC-DISASM-NEXT: 180003027: 66 0f 7f 54 24 20            movdqa  %xmm2, 0x20(%rsp)
-EC-DISASM-NEXT: 18000302d: 66 0f 7f 5c 24 30            movdqa  %xmm3, 0x30(%rsp)
-EC-DISASM-NEXT: 180003033: 48 8b d0                     movq    %rax, %rdx
-EC-DISASM-NEXT: 180003036: 48 8d 0d 0b 13 00 00         leaq    0x130b(%rip), %rcx      # 0x180004348
-EC-DISASM-NEXT: 18000303d: e8 c6 ef ff ff               callq   0x180002008 <.text+0x1008>
-EC-DISASM-NEXT: 180003042: 66 0f 6f 04 24               movdqa  (%rsp), %xmm0
-EC-DISASM-NEXT: 180003047: 66 0f 6f 4c 24 10            movdqa  0x10(%rsp), %xmm1
-EC-DISASM-NEXT: 18000304d: 66 0f 6f 54 24 20            movdqa  0x20(%rsp), %xmm2
-EC-DISASM-NEXT: 180003053: 66 0f 6f 5c 24 30            movdqa  0x30(%rsp), %xmm3
-EC-DISASM-NEXT: 180003059: 48 83 c4 48                  addq    $0x48, %rsp
-EC-DISASM-NEXT: 18000305d: 41 59                        popq    %r9
-EC-DISASM-NEXT: 18000305f: 41 58                        popq    %r8
-EC-DISASM-NEXT: 180003061: 5a                           popq    %rdx
-EC-DISASM-NEXT: 180003062: 59                           popq    %rcx
-EC-DISASM-NEXT: 180003063: ff e0                        jmpq    *%rax
+EC-DISASM-NEXT: 180003012: 48 89 4c 24 08               movq    %rcx, 0x8(%rsp)
+EC-DISASM-NEXT: 180003017: 48 89 54 24 10               movq    %rdx, 0x10(%rsp)
+EC-DISASM-NEXT: 18000301c: 4c 89 44 24 18               movq    %r8, 0x18(%rsp)
+EC-DISASM-NEXT: 180003021: 4c 89 4c 24 20               movq    %r9, 0x20(%rsp)
+EC-DISASM-NEXT: 180003026: 48 83 ec 68                  subq    $0x68, %rsp
+EC-DISASM-NEXT: 18000302a: 66 0f 7f 44 24 20            movdqa  %xmm0, 0x20(%rsp)
+EC-DISASM-NEXT: 180003030: 66 0f 7f 4c 24 30            movdqa  %xmm1, 0x30(%rsp)
+EC-DISASM-NEXT: 180003036: 66 0f 7f 54 24 40            movdqa  %xmm2, 0x40(%rsp)
+EC-DISASM-NEXT: 18000303c: 66 0f 7f 5c 24 50            movdqa  %xmm3, 0x50(%rsp)
+EC-DISASM-NEXT: 180003042: 48 8b d0                     movq    %rax, %rdx
+EC-DISASM-NEXT: 180003045: 48 8d 0d f4 12 00 00         leaq    0x12f4(%rip), %rcx      # 0x180004340
+EC-DISASM-NEXT: 18000304c: e8 b7 ef ff ff               callq   0x180002008 <.text+0x1008>
+EC-DISASM-NEXT: 180003051: 66 0f 6f 44 24 20            movdqa  0x20(%rsp), %xmm0
+EC-DISASM-NEXT: 180003057: 66 0f 6f 4c 24 30            movdqa  0x30(%rsp), %xmm1
+EC-DISASM-NEXT: 18000305d: 66 0f 6f 54 24 40            movdqa  0x40(%rsp), %xmm2
+EC-DISASM-NEXT: 180003063: 66 0f 6f 5c 24 50            movdqa  0x50(%rsp), %xmm3
+EC-DISASM-NEXT: 180003069: 48 8b 4c 24 70               movq    0x70(%rsp), %rcx
+EC-DISASM-NEXT: 18000306e: 48 8b 54 24 78               movq    0x78(%rsp), %rdx
+EC-DISASM-NEXT: 180003073: 4c 8b 84 24 80 00 00 00      movq    0x80(%rsp), %r8
+EC-DISASM-NEXT: 18000307b: 4c 8b 8c 24 88 00 00 00      movq    0x88(%rsp), %r9
+EC-DISASM-NEXT: 180003083: 48 83 c4 68                  addq    $0x68, %rsp
+EC-DISASM-NEXT: 180003087: ff e0                        jmpq    *%rax
 
 RUN: llvm-readobj --coff-load-config out-ec.dll | FileCheck --check-prefix=EC-LOADCFG %s
 EC-LOADCFG:      AuxiliaryDelayloadIAT: 0x5000
diff --git a/lld/test/COFF/delayimports.test b/lld/test/COFF/delayimports.test
index f410eef35fd1..ed074f462c7d 100644
--- a/lld/test/COFF/delayimports.test
+++ b/lld/test/COFF/delayimports.test
@@ -10,7 +10,7 @@ IMPORT-NEXT:   Name: std64.dll
 IMPORT-NEXT:   Attributes: 0x1
 IMPORT-NEXT:   ModuleHandle: 0x3018
 IMPORT-NEXT:   ImportAddressTable: 0x3020
-IMPORT-NEXT:   ImportNameTable: 0x2050
+IMPORT-NEXT:   ImportNameTable: 0x2048
 IMPORT-NEXT:   BoundDelayImportTable: 0x0
 IMPORT-NEXT:   UnloadDelayImportTable: 0x0
 IMPORT-NEXT:   Import {
@@ -44,22 +44,18 @@ BASEREL-NEXT:   }
 UNWIND:      UnwindInformation [
 UNWIND-NEXT:   RuntimeFunction {
 UNWIND-NEXT:     StartAddress: (0x14000108A)
-UNWIND-NEXT:     EndAddress: (0x1400010DD)
+UNWIND-NEXT:     EndAddress: (0x140001101)
 UNWIND-NEXT:     UnwindInfoAddress: (0x140002000)
 UNWIND-NEXT:     UnwindInfo {
 UNWIND-NEXT:       Version: 1
 UNWIND-NEXT:       Flags [ (0x0)
 UNWIND-NEXT:       ]
-UNWIND-NEXT:       PrologSize: 10
+UNWIND-NEXT:       PrologSize: 24
 UNWIND-NEXT:       FrameRegister: -
 UNWIND-NEXT:       FrameOffset: -
-UNWIND-NEXT:       UnwindCodeCount: 5
+UNWIND-NEXT:       UnwindCodeCount: 1
 UNWIND-NEXT:       UnwindCodes [
-UNWIND-NEXT:         0x0A: ALLOC_SMALL size=72
-UNWIND-NEXT:         0x06: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x04: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x02: ALLOC_SMALL size=8
-UNWIND-NEXT:         0x01: ALLOC_SMALL size=8
+UNWIND-NEXT:         0x18: ALLOC_SMALL size=104
 UNWIND-NEXT:       ]
 UNWIND-NEXT:     }
 UNWIND-NEXT:   }
diff --git a/lld/test/COFF/delayimporttables.yaml b/lld/test/COFF/delayimporttables.yaml
index cf54c0a7140a..ff6681257c83 100644
--- a/lld/test/COFF/delayimporttables.yaml
+++ b/lld/test/COFF/delayimporttables.yaml
@@ -15,7 +15,7 @@
 # CHECK-NEXT:   Attributes: 0x1
 # CHECK-NEXT:   ModuleHandle: 0x3000
 # CHECK-NEXT:   ImportAddressTable: 0x3010
-# CHECK-NEXT:   ImportNameTable: 0x2070
+# CHECK-NEXT:   ImportNameTable: 0x2068
 # CHECK-NEXT:   BoundDelayImportTable: 0x0
 # CHECK-NEXT:   UnloadDelayImportTable: 0x0
 # CHECK-NEXT:   Import {
@@ -32,16 +32,16 @@
 # CHECK-NEXT:   Attributes: 0x1
 # CHECK-NEXT:   ModuleHandle: 0x3008
 # CHECK-NEXT:   ImportAddressTable: 0x3028
-# CHECK-NEXT:   ImportNameTable: 0x2088
+# CHECK-NEXT:   ImportNameTable: 0x2080
 # CHECK-NEXT:   BoundDelayImportTable: 0x0
 # CHECK-NEXT:   UnloadDelayImportTable: 0x0
 # CHECK-NEXT:   Import {
 # CHECK-NEXT:     Symbol: left (0)
-# CHECK-NEXT:     Address: 0x1400010B8
+# CHECK-NEXT:     Address: 0x1400010DC
 # CHECK-NEXT:   }
 # CHECK-NEXT:   Import {
 # CHECK-NEXT:     Symbol: right (0)
-# CHECK-NEXT:     Address: 0x1400010C4
+# CHECK-NEXT:     Address: 0x1400010E8
 # CHECK-NEXT:   }
 # CHECK-NEXT: }
 
diff --git a/lld/test/COFF/dtlto/files.test b/lld/test/COFF/dtlto/files.test
new file mode 100644
index 000000000000..4297adac9bbf
--- /dev/null
+++ b/lld/test/COFF/dtlto/files.test
@@ -0,0 +1,71 @@
+REQUIRES: x86
+
+## Test that the LLD options /lldsavetemps and -thinlto-emit-imports-files
+## function correctly with DTLTO we also check that index files 
+## (-thinlto-emit-index-files) are not emitted with DTLTO.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: sed 's/@t1/@t2/g' t1.ll > t2.ll
+
+## Generate ThinLTO bitcode files. Note that t3.bc will not be used by the
+## linker.
+RUN: opt -thinlto-bc t1.ll -o t1.bc
+RUN: opt -thinlto-bc t2.ll -o t2.bc
+RUN: cp t1.bc t3.bc
+
+## Generate object files for mock.py to return.
+RUN: llc t1.ll --filetype=obj -o t1.obj
+RUN: llc t2.ll --filetype=obj -o t2.obj
+
+## Create response file containing shared ThinLTO linker arguments.
+## -start-lib/-end-lib is used to test the special case where unused lazy
+## bitcode inputs result in empty index/imports files.
+## Note that mock.py does not do any compilation; instead, it simply writes
+## the contents of the object files supplied on the command line into the
+## output object files in job order.
+RUN: echo "/entry:t1 /subsystem:console \
+RUN:   t1.bc t2.bc -start-lib t3.bc -end-lib /out:my.exe \
+RUN:   -thinlto-distributor:\"%python\" \
+RUN:   -thinlto-distributor-arg:\"%llvm_src_root/utils/dtlto/mock.py\" \
+RUN:   -thinlto-distributor-arg:t1.obj \
+RUN:   -thinlto-distributor-arg:t2.obj \
+RUN:   -thinlto-remote-compiler:fake.exe" > l.rsp
+
+## Check that without extra flags, no index/imports files are produced and
+## backend temp files are removed.
+RUN: lld-link @l.rsp
+RUN: ls | FileCheck %s \
+RUN:   --check-prefixes=NOBACKEND,NOOTHERS
+
+## Check that with /lldsavetemps and -thinlto-emit-imports-files backend 
+## tempoary files are retained and no index/imports files are produced.
+RUN: rm -f *.imports *.thinlto.bc
+RUN: lld-link @l.rsp  /lldsavetemps -thinlto-emit-imports-files
+RUN: ls | sort | FileCheck %s \
+RUN:   --check-prefixes=BACKEND,NOOTHERS
+
+## JSON jobs description, retained with --save-temps.
+## Note that DTLTO temporary files include a PID component.
+NOBACKEND-NOT: {{^}}my.[[#]].dist-file.json{{$}}
+BACKEND:       {{^}}my.[[#]].dist-file.json{{$}}
+
+## Index/imports files for t1.bc.
+NOOTHERS-NOT: {{^}}t1.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t1.bc.thinlto.bc{{$}}
+
+## Index/imports files for t2.bc.
+NOOTHERS-NOT: {{^}}t2.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t2.bc.thinlto.bc{{$}}
+
+## Empty index/imports files for unused t3.bc.
+NOOTHERS-NOT: {{^}}t3.bc.imports{{$}}
+NOOTHERS-NOT: {{^}}t3.bc.thinlto.bc{{$}}
+
+#--- t1.ll
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @t1() {
+  ret void
+}
diff --git a/lld/test/COFF/dtlto/options.test b/lld/test/COFF/dtlto/options.test
new file mode 100644
index 000000000000..023ecd235910
--- /dev/null
+++ b/lld/test/COFF/dtlto/options.test
@@ -0,0 +1,56 @@
+REQUIRES: x86
+
+## Test that DTLTO-specific options are handled correctly.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: opt -thinlto-bc foo.ll -o foo.obj
+
+## Not specifying a value for -thinlto-remote-compiler should result in an
+## error if -thinlto-distributor is specified.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:fake.exe 2>&1 | FileCheck %s --check-prefix=COMPILER
+RUN: lld-link /entry:foo /subsystem:console foo.obj /out:my.exe
+
+## Specifying an empty value for -thinlto-remote-compiler should result in an
+## error if -thinlto-distributor is specified.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:fake.exe \
+RUN:   -thinlto-remote-compiler:"" 2>&1 | FileCheck %s --check-prefix=COMPILER
+RUN: lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-remote-compiler:""
+
+COMPILER: error: A value must be specified for /thinlto-remote-compiler if /thinlto-distributor is specified.
+
+## Test that DTLTO options are passed correctly to the distributor and
+## remote compiler.
+## Note: validate.py does not perform any compilation. Instead, it validates the
+## received JSON, pretty-prints the JSON and the supplied arguments, and then
+## exits with an error. This allows FileCheck directives to verify the
+## distributor inputs.
+RUN: not lld-link /entry:foo /subsystem:console foo.obj /out:my.exe \
+RUN:   -thinlto-distributor:%python \
+RUN:   -thinlto-distributor-arg:%llvm_src_root/utils/dtlto/validate.py \
+RUN:   -thinlto-distributor-arg:darg1=10 \
+RUN:   -thinlto-distributor-arg:darg2=20 \
+RUN:   -thinlto-remote-compiler:my_clang.exe \
+RUN:   -thinlto-remote-compiler-arg:carg1=20 \
+RUN:   -thinlto-remote-compiler-arg:carg2=30 2>&1 | FileCheck %s
+
+CHECK: distributor_args=['darg1=10', 'darg2=20']
+
+CHECK: "linker_output": "my.exe"
+
+CHECK: "my_clang.exe"
+CHECK: "carg1=20"
+CHECK: "carg2=30"
+
+CHECK: error: DTLTO backend compilation: cannot open native object file:
+
+#--- foo.ll
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @foo() {
+  ret void
+}
diff --git a/lld/test/COFF/giats.s b/lld/test/COFF/giats.s
index f870429f39d8..c0442107d3ae 100644
--- a/lld/test/COFF/giats.s
+++ b/lld/test/COFF/giats.s
@@ -37,14 +37,14 @@
 
 # DELAY-CHECK: ImageBase: 0x140000000
 # DELAY-CHECK: LoadConfig [
-# DELAY-CHECK:   GuardCFFunctionTable: 0x140002124
+# DELAY-CHECK:   GuardCFFunctionTable: 0x14000211C
 # DELAY-CHECK:   GuardCFFunctionCount: 2
 # DELAY-CHECK:   GuardFlags [ (0x10500)
 # DELAY-CHECK:     CF_FUNCTION_TABLE_PRESENT (0x400)
 # DELAY-CHECK:     CF_INSTRUMENTED (0x100)
 # DELAY-CHECK:     CF_LONGJUMP_TABLE_PRESENT (0x10000)
 # DELAY-CHECK:   ]
-# DELAY-CHECK:   GuardAddressTakenIatEntryTable: 0x14000212C
+# DELAY-CHECK:   GuardAddressTakenIatEntryTable: 0x140002124
 # DELAY-CHECK:   GuardAddressTakenIatEntryCount: 1
 # DELAY-CHECK: ]
 # DELAY-CHECK:      GuardFidTable [
diff --git a/lld/test/COFF/imported-dllmain-i386.test b/lld/test/COFF/imported-dllmain-i386.test
new file mode 100644
index 000000000000..f8aa09006999
--- /dev/null
+++ b/lld/test/COFF/imported-dllmain-i386.test
@@ -0,0 +1,58 @@
+REQUIRES: x86
+RUN: split-file %s %t.dir && cd %t.dir
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows a.s -o a.obj
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows b1.s -o b1.obj
+RUN: llvm-mc -filetype=obj -triple=i386-windows b2.s -o b2.obj
+
+### This is the line where our problem occurs. Here, we export the DllMain symbol which shouldn't happen normally.
+RUN: lld-link b1.obj b2.obj -out:b.dll -dll -implib:b.lib -entry:DllMain -export:bar -export:DllMain -safeseh:no
+
+RUN: llvm-mc -filetype=obj -triple=i386-windows c.s -o c.obj
+RUN: lld-link -lib c.obj -out:c.lib
+
+### Later, if b.lib is provided before other libs/objs that export DllMain statically, we previously were using the dllimported DllMain from b.lib, which is wrong.
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -safeseh:no 2>&1 | FileCheck -check-prefix=WARN %s
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:importeddllmain -safeseh:no 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
+RUN: llvm-objdump --private-headers -d out.dll | FileCheck -check-prefix=DISASM %s
+
+WARN: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+IGNORED-NOT: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+
+DISASM: The Import Tables:
+DISASM: DLL Name: b.dll
+DISASM-NOT: DllMain
+DISASM: bar
+DISASM: Disassembly of section .text:
+DISASM:      b0 01                         movb    $0x1, %al
+DISASM-NEXT: c3                            retl
+
+#--- a.s
+        .text
+        .globl _foo
+_foo:
+        call *__imp__bar
+        ret
+
+#--- b1.s
+        .text
+        .globl _bar
+_bar:
+        ret
+
+#--- b2.s
+        .intel_syntax noprefix
+        .text
+        .globl _DllMain
+_DllMain:
+        xor al, al
+        ret
+
+#--- c.s
+        .intel_syntax noprefix
+        .text
+        .globl _DllMain
+_DllMain:
+        mov al, 1 
+        ret
diff --git a/lld/test/COFF/exported-dllmain.test b/lld/test/COFF/imported-dllmain.test
similarity index 86%
rename from lld/test/COFF/exported-dllmain.test
rename to lld/test/COFF/imported-dllmain.test
index fcf6ed100537..fa8579b1b41c 100644
--- a/lld/test/COFF/exported-dllmain.test
+++ b/lld/test/COFF/imported-dllmain.test
@@ -14,11 +14,11 @@ RUN: lld-link -lib c.obj -out:c.lib
 
 ### Later, if b.lib is provided before other libs/objs that export DllMain statically, we previously were using the dllimported DllMain from b.lib, which is wrong.
 RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain 2>&1 | FileCheck -check-prefix=WARN %s
-RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:exporteddllmain 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
+RUN: lld-link a.obj b.lib c.lib -dll -out:out.dll -entry:DllMain -ignore:importeddllmain 2>&1 | FileCheck -check-prefix=IGNORED --allow-empty %s
 RUN: llvm-objdump --private-headers -d out.dll | FileCheck -check-prefix=DISASM %s
 
-WARN: lld-link: warning: b.lib: skipping exported DllMain symbol [exporteddllmain]
-IGNORED-NOT: lld-link: warning: b.lib: skipping exported DllMain symbol [exporteddllmain]
+WARN: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
+IGNORED-NOT: lld-link: warning: b.lib: skipping imported DllMain symbol [importeddllmain]
 
 DISASM: The Import Tables:
 DISASM: DLL Name: b.dll
diff --git a/lld/test/COFF/pdb-empty-sec.s b/lld/test/COFF/pdb-empty-sec.s
new file mode 100644
index 000000000000..0d61447b7665
--- /dev/null
+++ b/lld/test/COFF/pdb-empty-sec.s
@@ -0,0 +1,19 @@
+// REQUIRES: x86
+
+// RUN: llvm-mc -filetype=obj -triple=x86_64-windows %s -o %t.obj
+// RUN: lld-link -dll -noentry -debug %t.obj -out:%t.dll
+// RUN: llvm-pdbutil dump -publics %t.pdb | FileCheck %s
+
+// CHECK:       Records
+// CHECK-NEXT:       0 | S_PUB32 [size = 20] `func`
+// CHECK-NEXT:           flags = none, addr = 0001:0000
+// CHECK-NEXT:      20 | S_PUB32 [size = 20] `sym`
+// CHECK-NEXT:           flags = none, addr = 0000:0000
+
+        .globl sym
+        .data
+sym:
+        .text
+        .globl func
+func:
+        ret
diff --git a/lld/test/COFF/thin-archive.s b/lld/test/COFF/thin-archive.s
index 55d71ea63567..7fab10c2b57b 100644
--- a/lld/test/COFF/thin-archive.s
+++ b/lld/test/COFF/thin-archive.s
@@ -22,23 +22,34 @@
 # SYMTAB:        ?f@@YAHXZ in
 # NO-SYMTAB-NOT: ?f@@YAHXZ in
 
-# RUN: lld-link /entry:main %t.main.obj %t.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: lld-link /entry:main %t.main.obj %t_thin.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: lld-link /entry:main %t.main.obj /wholearchive:%t_thin.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
+# RUN: echo "/entry:main \"%t.main.obj\" /out:\"%t.exe\"" > %t.rsp
+
+# RUN: lld-link @%t.rsp %t.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_NON_THIN
+# RUN: lld-link @%t.rsp %t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_SYM
+# RUN: lld-link @%t.rsp /wholearchive:%t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_WHOLE
+# RUN: lld-link @%t.rsp /wholearchive %t_thin.lib /verbose 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=LOAD_THIN_WHOLE
+
+# LOAD_NON_THIN:   Loaded {{.*}}.lib({{.*}}.obj) for int __cdecl f(void)
+# LOAD_THIN_SYM:   Loaded {{.*}}.obj for int __cdecl f(void)
+# LOAD_THIN_WHOLE: Loaded {{.*}}.obj for <whole-archive>
 
 # RUN: rm %t.lib.obj
-# RUN: lld-link /entry:main %t.main.obj %t.lib /out:%t.exe 2>&1 | \
-# RUN:     FileCheck --allow-empty %s
-# RUN: env LLD_IN_TEST=1 not lld-link /entry:main %t.main.obj %t_thin.lib \
-# RUN:     /out:%t.exe 2>&1 | FileCheck --check-prefix=NOOBJ %s
-# RUN: env LLD_IN_TEST=1 not lld-link /entry:main %t.main.obj %t_thin.lib /out:%t.exe \
-# RUN:     /demangle:no 2>&1 | FileCheck --check-prefix=NOOBJNODEMANGLE %s
-
-# CHECK-NOT: error: could not get the buffer for the member defining
+# RUN: lld-link @%t.rsp %t.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=ERR --allow-empty
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp %t_thin.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJ
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp /wholearchive:%t_thin.lib 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJWHOLE
+# RUN: env LLD_IN_TEST=1 not lld-link @%t.rsp %t_thin.lib /demangle:no 2>&1 | \
+# RUN:     FileCheck %s --check-prefix=NOOBJNODEMANGLE
+
+# ERR-NOT: error: could not get the buffer for the member defining
 # NOOBJ: error: could not get the buffer for the member defining symbol int __cdecl f(void): {{.*}}.lib({{.*}}.lib.obj):
+# NOOBJWHOLE: error: {{.*}}.lib: could not get the buffer for a child of the archive: '{{.*}}.obj'
 # NOOBJNODEMANGLE: error: could not get the buffer for the member defining symbol ?f@@YAHXZ: {{.*}}.lib({{.*}}.lib.obj):
 
 	.text
diff --git a/lld/test/ELF/dtlto/archive-thin.test b/lld/test/ELF/dtlto/archive-thin.test
new file mode 100644
index 000000000000..df3c2aadb06e
--- /dev/null
+++ b/lld/test/ELF/dtlto/archive-thin.test
@@ -0,0 +1,65 @@
+REQUIRES: x86
+
+## Test that a DTLTO link assigns Module IDs to thin archive members as expected.
+
+RUN: rm -rf %t && split-file %s %t && cd %t
+
+RUN: sed 's/@t1/@t2/g' t1.ll > t2.ll
+RUN: sed 's/@t1/@t3/g' t1.ll > t3.ll
+
+RUN: opt -thinlto-bc t1.ll -o t1.bc
+RUN: opt -thinlto-bc t2.ll -o t2.bc
+RUN: opt -thinlto-bc t3.ll -o t3.bc
+
+RUN: llvm-ar rcs t1.a t1.bc --thin
+## Create this bitcode thin archive in a subdirectory to test the expansion of
+## the path to a bitcode file that is referenced using "..", e.g., in this case
+## "../t2.bc".
+RUN: mkdir lib
+RUN: llvm-ar rcs lib/t2.a t2.bc --thin
+## Create this bitcode thin archive with an absolute path entry containing "..".
+RUN: llvm-ar rcs t3.a %t/lib/../t3.bc --thin
+
+## Link from a different directory to ensure that thin archive member paths are
+## resolved correctly relative to the archive locations.
+RUN: mkdir %t/out && cd %t/out
+
+## Build a response file to share common linking arguments.
+## Note: validate.py does not perform any compilation. Instead, it validates the
+## received JSON, pretty-prints the JSON and the supplied arguments, and then
+## exits with an error. This allows FileCheck directives to verify the
+## distributor inputs.
+RUN: echo "%t/t1.a %t/lib/t2.a ../t3.a \
+RUN:   --thinlto-distributor=\"%python\" \
+RUN:   --thinlto-distributor-arg=\"%llvm_src_root/utils/dtlto/validate.py\"" > rsp
+
+## Link thin archives using -u/--undefined.
+RUN: not ld.lld @rsp -u t1 -u t2 -u t3 2>&1 | FileCheck %s
+
+## Link thin archives using --whole-archive.
+RUN: not ld.lld --whole-archive @rsp 2>&1 | FileCheck %s
+
+## Check the module IDs in the JSON jobs description.
+CHECK: "jobs": [
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t1.bc"
+
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]\:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t2.bc"
+
+CHECK: "inputs": [
+CHECK-NEXT: "{{([a-zA-Z]:)|/}}
+CHECK-SAME: {{/|\\\\}}archive-thin.test.tmp{{/|\\\\}}t3.bc"
+
+## Ensure backend compilation fails as expected (due to validate.py dummy behavior).
+CHECK: error: DTLTO backend compilation: cannot open native object file:
+
+#--- t1.ll
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @t1() {
+  ret void
+}
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
deleted file mode 100644
index 53860b5daf2b..000000000000
--- a/lld/test/ELF/hexagon-jump-error.s
+++ /dev/null
@@ -1,32 +0,0 @@
-# REQUIRES: hexagon
-# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-## Use --threads=1 to keep emitted warnings across sections sequential.
-# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
-
-	.globl	_start
-	.type	_start, @function
-_start:
-
-# CHECK: relocation R_HEX_B9_PCREL out of range: 1028 is not in [-1024, 1023]
-{r0 = #0; jump #1f}
-.space (1<<10)
-.section b9, "ax"
-1:
-
-# CHECK: relocation R_HEX_B13_PCREL out of range: 16388 is not in [-16384, 16383]
-if (r0==#0) jump:t #1f
-.space (1<<14)
-.section b13, "ax"
-1:
-
-# CHECK: relocation R_HEX_B15_PCREL out of range: 65540 is not in [-65536, 65535]
-if (p0) jump #1f
-.space (1<<16)
-.section b15, "ax"
-1:
-
-# CHECK: relocation R_HEX_B22_PCREL out of range: 8388612 is not in [-8388608, 8388607]
-jump #1f
-.space (1<<23)
-.section b22, "ax"
-1:
diff --git a/lld/test/ELF/hexagon-plt.s b/lld/test/ELF/hexagon-plt.s
index 679de82923a7..780dc434a669 100644
--- a/lld/test/ELF/hexagon-plt.s
+++ b/lld/test/ELF/hexagon-plt.s
@@ -30,31 +30,31 @@
 # DIS:      <_start>:
 ## Direct call
 ## Call foo directly
-# DIS-NEXT:   { call 0x2003c }
+# DIS-NEXT:   { call 0x2003c <foo> }
 ## Call bar via plt
-# DIS-NEXT:   { call 0x20060 }
+# DIS-NEXT:   { call 0x20060 <bar@plt> }
 ## Call weak via plt
-# DIS-NEXT:   { call 0x20070 }
+# DIS-NEXT:   { call 0x20070 <weak@plt> }
 # DIS-NEXT: { 	immext(#0)
 
 ## Call foo directly
-# DIS-NEXT: if (p0) jump:nt 0x2003c }
+# DIS-NEXT: if (p0) jump:nt 0x2003c <foo> }
 # DIS-NEXT: { 	immext(#64)
 ## Call bar via plt
-# DIS-NEXT: if (p0) jump:nt 0x20060 }
+# DIS-NEXT: if (p0) jump:nt 0x20060 <bar@plt> }
 # DIS-NEXT: { 	immext(#64)
 ## Call weak via plt
-# DIS-NEXT: if (p0) jump:nt 0x20070 }
+# DIS-NEXT: if (p0) jump:nt 0x20070 <weak@plt> }
 # DIS-NEXT: { 	immext(#0)
 
 ## Call foo directly
-# DIS-NEXT: r0 = #0 ; jump 0x2003c }
+# DIS-NEXT: r0 = #0 ; jump 0x2003c <foo> }
 # DIS-NEXT: { 	immext(#0)
 ## Call bar via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20060 }
+# DIS-NEXT: r0 = #0 ; jump 0x20060 <bar@plt> }
 # DIS-NEXT: { 	immext(#0)
 ## Call weak via plt
-# DIS-NEXT: r0 = #0 ; jump 0x20070 }
+# DIS-NEXT: r0 = #0 ; jump 0x20070 <weak@plt> }
 
 # DIS:      <foo>:
 # DIS-NEXT:   2003c:
diff --git a/lld/test/ELF/hexagon-shared.s b/lld/test/ELF/hexagon-shared.s
index cc62662d278e..7f7390f1fa8d 100644
--- a/lld/test/ELF/hexagon-shared.s
+++ b/lld/test/ELF/hexagon-shared.s
@@ -88,7 +88,7 @@ pvar:
 # PLT-NEXT: jumpr r28 }
 
 # TEXT:  bc 00 01 00 000100bc
-# TEXT: { 	call 0x10300 }
+# TEXT: { 	call 0x10300 <bar@plt> }
 # TEXT: if (p0) jump:nt 0x10300
 # TEXT: r0 = #0 ; jump 0x10300
 # TEXT: r0 = add(r1,##-65548)
diff --git a/lld/test/ELF/hexagon-thunk-range-b22rel.s b/lld/test/ELF/hexagon-thunk-range-b22rel.s
new file mode 100644
index 000000000000..08e37bf0a555
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-b22rel.s
@@ -0,0 +1,115 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld main.o -o test
+# RUN: llvm-objdump -d --no-show-raw-insn test | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_B22_PCREL relocations.
+## R_HEX_B22_PCREL has a range of +/- 8MB (0x800000 bytes).
+
+#--- main.s
+.globl _start
+.type _start, %function
+_start:
+  call target_within_range_max
+  call target_beyond_range
+  call target_within_range_min
+  call target_beyond_range_min
+  call target_multiple_calls
+  call target_multiple_calls
+  call target_close
+  jumpr r31
+
+target_close:
+  jumpr r31
+
+## Target at maximum positive range (8MB - 4 bytes from _start)
+## We need to account for the instructions above: 7 calls + 1 jumpr = 8 * 4 = 32 bytes
+.skip 0X7fffbc
+.globl target_within_range_max
+.type target_within_range_max, %function
+target_within_range_max:
+  jumpr r31
+
+## Target just beyond maximum positive range (needs thunk)
+.skip 8
+.globl target_beyond_range
+.type target_beyond_range, %function
+target_beyond_range:
+  call target_within_range_max
+  jumpr r31
+
+## Target for multiple calls test
+.skip 0x100000
+.globl target_multiple_calls
+.type target_multiple_calls, %function
+target_multiple_calls:
+  jumpr r31
+
+## Now place targets at maximum negative range
+## We'll put these before _start in memory layout
+.section .text_negative, "ax", %progbits
+
+## Target at maximum negative range (-8MB + 4 bytes from _start)
+.globl target_within_range_min
+.type target_within_range_min, %function
+target_within_range_min:
+  call target_close
+  jumpr r31
+
+.skip 0X7ffff4
+
+## Target beyond maximum negative range (needs thunk)
+.globl target_beyond_range_min
+.type target_beyond_range_min, %function
+target_beyond_range_min:
+  jumpr r31
+
+## Verify thunk generation for targets beyond B22_PCREL range
+# CHECK:       <__hexagon_thunk_target_within_range_min_from_.text.thunk>:
+# CHECK-NEXT:    200b4: { immext(#0x900000)
+# CHECK-NEXT:             jump 0x9200cc <target_within_range_min> }
+
+# CHECK:       <__hexagon_thunk_target_beyond_range_min_from_.text.thunk>:
+# CHECK-NEXT:    200bc: { immext(#0x1100000)
+# CHECK-NEXT:             jump 0x11200c8 <target_beyond_range_min> }
+
+# CHECK:       <__hexagon_thunk_target_multiple_calls_from_.text.thunk>:
+# CHECK-NEXT:    200c4: { immext(#0x8fffc0)
+# CHECK-NEXT:             jump 0x9200c0 <target_multiple_calls> }
+
+## Verify _start calls - some direct, some via thunks
+# CHECK:       <_start>:
+# CHECK-NEXT:    200cc: { call 0x8200ac <target_within_range_max> }
+# CHECK-NEXT:           { call 0x8200b8 <target_beyond_range> }
+# CHECK-NEXT:           { call 0x200b4 <__hexagon_thunk_target_within_range_min_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200bc <__hexagon_thunk_target_beyond_range_min_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200c4 <__hexagon_thunk_target_multiple_calls_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200c4 <__hexagon_thunk_target_multiple_calls_from_.text.thunk> }
+# CHECK-NEXT:           { call 0x200ec <target_close> }
+
+# CHECK:      <target_close>:
+# CHECK-NEXT:    200ec: { jumpr r31 }
+
+## Verify targets at maximum positive range (direct calls, no thunks needed)
+# CHECK:      <target_within_range_max>:
+# CHECK-NEXT:  8200ac: { jumpr r31 }
+
+# CHECK:      <target_beyond_range>:
+# CHECK-NEXT:  8200b8: { call 0x8200ac <target_within_range_max> }
+# CHECK-NEXT:          { jumpr r31 }
+
+# CHECK:      <target_multiple_calls>:
+# CHECK-NEXT:  9200c0: { jumpr r31 }
+
+## Verify targets in negative section and thunk for calling back to main section
+# CHECK:      <__hexagon_thunk__from_.text.thunk>:
+# CHECK-NEXT:  9200c4: { immext(#0xff700000)
+# CHECK-NEXT:            jump 0x200cc <_start> }
+
+# CHECK:      <target_within_range_min>:
+# CHECK-NEXT:  9200cc: { call 0x9200c4 <__hexagon_thunk__from_.text.thunk> }
+# CHECK-NEXT:          { jumpr r31 }
+
+# CHECK:      <target_beyond_range_min>:
+# CHECK-NEXT: 11200c8: { jumpr r31 }
diff --git a/lld/test/ELF/hexagon-thunk-range-gdplt.s b/lld/test/ELF/hexagon-thunk-range-gdplt.s
new file mode 100644
index 000000000000..77fd0e575456
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-gdplt.s
@@ -0,0 +1,95 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld -shared main.o -o test.so
+# RUN: llvm-objdump -d --no-show-raw-insn test.so | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_GD_PLT_B22_PCREL relocations.
+## Same ±8MB range as regular calls.
+
+#--- main.s
+.globl _start
+.type _start, @function
+_start:
+  ## Setup for TLS Global Dynamic calls
+  r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+
+  ## Test TLS GD PLT calls
+  r0 = add(r2,##tls_var_close@GDGOT)
+  call tls_var_close@GDPLT
+
+  r0 = add(r2,##tls_var_far@GDGOT)
+  call tls_var_far@GDPLT
+
+  jumpr r31
+
+.skip 0x400000
+
+more_code:
+  r0 = add(r2,##tls_var_distant@GDGOT)
+  call tls_var_distant@GDPLT
+  jumpr r31
+
+## TLS variables in .tdata section
+.section .tdata,"awT",@progbits
+.globl tls_var_close, tls_var_far, tls_var_distant
+.type tls_var_close, @object
+.type tls_var_far, @object
+.type tls_var_distant, @object
+
+tls_var_close:
+  .word 0x1234
+
+tls_var_far:
+  .word 0x5678
+
+tls_var_distant:
+  .word 0x9abc
+
+# CHECK: Disassembly of section .text:
+# CHECK:     <_start>:
+# CHECK-NEXT:   102d4:  { immext(#0x420100)
+# CHECK-NEXT:      r2 = add(pc,##0x420130) }
+# CHECK-NEXT:    { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10018) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10010) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+# CHECK:     <more_code>:
+# CHECK-NEXT:   4102f8:  { immext(#0xfffeffc0)
+# CHECK-NEXT:      r0 = add(r2,##-0x10008) }
+# CHECK-NEXT:    { call 0x410360 <__tls_get_addr@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+## Verify PLT entries are created for TLS
+# CHECK: Disassembly of section .plt:
+# CHECK:      <.plt>:
+# CHECK-NEXT:   410310:  { immext(#0x200c0)
+# CHECK-NEXT:      r28 = add(pc,##0x200f4) }
+# CHECK-NEXT:    { r14 -= add(r28,#0x10)
+# CHECK-NEXT:      r15 = memw(r28+#0x8)
+# CHECK-NEXT:      r28 = memw(r28+#0x4) }
+# CHECK-NEXT:    { r14 = asr(r14,#0x2)
+# CHECK-NEXT:      jumpr r28 }
+# CHECK-NEXT:    { trap0(#0xdb) }
+
+# CHECK:      <tls_var_far@plt>:
+# CHECK-NEXT:   410340:  { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200d8) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <tls_var_distant@plt>:
+# CHECK-NEXT:   410350:  { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200cc) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <__tls_get_addr@plt>:
+# CHECK-NEXT:   410360: { immext(#0x200c0)
+# CHECK-NEXT:      r14 = add(pc,##0x200c0) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
diff --git a/lld/test/ELF/hexagon-thunk-range-plt.s b/lld/test/ELF/hexagon-thunk-range-plt.s
new file mode 100644
index 000000000000..3a8f50b681d8
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunk-range-plt.s
@@ -0,0 +1,75 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf external.s -o external.o
+# RUN: ld.lld -shared external.o -soname external.so -o external.so
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf main.s -o main.o
+# RUN: ld.lld main.o external.so -o test
+# RUN: llvm-objdump -d --no-show-raw-insn test | FileCheck %s
+
+## Test thunk range scenarios for Hexagon R_HEX_PLT_B22_PCREL relocations.
+## PLT calls use the same ±8MB range as regular calls but go through PLT entries.
+## This test verifies thunk generation for PLT calls at range boundaries.
+
+#--- external.s
+.globl extern_within_range, extern_beyond_range, extern_close
+.type extern_within_range, @function
+.type extern_beyond_range, @function
+.type extern_close, @function
+
+extern_within_range:
+  jumpr r31
+
+extern_beyond_range:
+  jumpr r31
+
+extern_close:
+  jumpr r31
+
+#--- main.s
+.globl _start
+.type _start, @function
+_start:
+  ## Test PLT calls to external functions at various ranges
+  call extern_within_range@PLT
+  call extern_beyond_range@PLT
+  call extern_close@PLT
+  jumpr r31
+
+.skip 0x200000
+
+# CHECK: Disassembly of section .text:
+# CHECK:     <_start>:
+# CHECK-NEXT:  2021c:  { call 0x220250 <extern_within_range@plt> }
+# CHECK-NEXT:    { call 0x220260 <extern_beyond_range@plt> }
+# CHECK-NEXT:    { call 0x220270 <extern_close@plt> }
+# CHECK-NEXT:    { jumpr r31 }
+
+## Verify PLT header and entries are created with exact addresses
+# CHECK: Disassembly of section .plt:
+# CHECK:      <.plt>:
+# CHECK-NEXT:   220230:  { immext(#0x20080)
+# CHECK-NEXT:      r28 = add(pc,##0x200b8) }
+# CHECK-NEXT:    { r14 -= add(r28,#0x10)
+# CHECK-NEXT:      r15 = memw(r28+#0x8)
+# CHECK-NEXT:      r28 = memw(r28+#0x4) }
+# CHECK-NEXT:    { r14 = asr(r14,#0x2)
+# CHECK-NEXT:      jumpr r28 }
+# CHECK-NEXT:    { trap0(#0xdb) }
+
+# CHECK:      <extern_within_range@plt>:
+# CHECK-NEXT:   220250:  { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x200a8) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <extern_beyond_range@plt>:
+# CHECK-NEXT:   220260: { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x2009c) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
+
+# CHECK:      <extern_close@plt>:
+# CHECK-NEXT:   220270:  { immext(#0x20080)
+# CHECK-NEXT:      r14 = add(pc,##0x20090) }
+# CHECK-NEXT:    { r28 = memw(r14+#0x0) }
+# CHECK-NEXT:    { jumpr r28 }
diff --git a/lld/test/ELF/hexagon-thunks-packets.s b/lld/test/ELF/hexagon-thunks-packets.s
new file mode 100644
index 000000000000..c8aaad4341ff
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunks-packets.s
@@ -0,0 +1,122 @@
+# REQUIRES: hexagon
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-linux-musl %s -o %t.o
+# RUN: ld.lld %t.o -o %t
+# RUN: llvm-objdump -d %t 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-NONPIC,CHECK %s
+# RUN: llvm-mc -filetype=obj \
+# RUN:         -triple=hexagon-unknown-linux-musl %s -o %t.o
+# RUN: ld.lld --pie %t.o -o %t
+# RUN: llvm-objdump -d %t 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-PIC,CHECK %s
+
+## Packets with pc-relative relocations are more interesting because
+## the offset must be relative to the start of the source, destination
+## packets and not necessarily the instruction word containing the jump/call.
+
+# CHECK:  Disassembly of section .text:
+
+# CHECK-NONPIC: 000200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>:
+# CHECK-NONPIC: { immext(#0x1000040)
+# CHECK-NONPIC:   jump 0x1020110 <myfn_a> }
+
+# CHECK-PIC:    00010150 <__hexagon_thunk_myfn_a_from_.text.thunk>:
+# CHECK-PIC-NEXT:    { immext(#0x1000040)
+# CHECK-PIC-NEXT:      r14 = add(pc,##0x1000060) }
+# CHECK-PIC-NEXT:    { jumpr r14 }
+
+# CHECK-NONPIC: 000200bc <myfn_b>:
+# CHECK-NONPIC: { jumpr r31 }
+# CHECK-PIC:    0001015c <myfn_b>:
+# CHECK-PIC:    { jumpr r31 }
+    .globl myfn_b
+    .type  myfn_b, @function
+myfn_b:
+    jumpr r31
+    .size  myfn_b, .-myfn_b
+
+# CHECK-PIC:    00010160 <main>:
+    .globl main
+    .type  main, @function
+main:
+    { r0 = #0
+      call myfn_a }
+# CHECK-PIC:      { call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC:   { call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:       r0 = #0x0 }
+    call myfn_a
+# CHECK-PIC:    call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC: call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+    call myfn_b
+# CHECK-PIC-NEXT:    call 0x1015c <myfn_b>
+# CHECK-NONPIC-NEXT: call 0x200bc <myfn_b>
+
+    { r2 = add(r0, r1)
+      if (p0) call #myfn_b
+      if (!p0) call #myfn_a }
+# CHECK-PIC-NEXT:     { if (p0) call 0x1015c <myfn_b>
+# CHECK-PIC-NEXT:       if (!p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:  { if (p0) call 0x200bc <myfn_b>
+# CHECK-NONPIC-NEXT:    if (!p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+
+# CHECK-NEXT:       r2 = add(r0,r1) }
+
+    { r2 = add(r0, r1)
+      if (p0) call #myfn_a
+      if (!p0) call #myfn_a }
+# CHECK-PIC-NEXT:  { if (p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-PIC-NEXT:    if (!p0) call 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:  { if (p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NONPIC-NEXT:    if (!p0) call 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:           r2 = add(r0,r1) }
+
+    { r2 = add(r0, r1)
+      r1 = r4
+      r4 = r5
+      if (r0 == #0) jump:t #myfn_a }
+# CHECK-PIC-NEXT:     { if (r0==#0) jump:t 0x10150
+# CHECK-NONPIC-NEXT:  { if (r0==#0) jump:t 0x200b4
+# CHECK-NEXT:    r2 = add(r0,r1)
+# CHECK-NEXT:    r1 = r4; r4 = r5 }
+
+    { r2 = add(r0, r1)
+      r4 = r5
+      if (r0 <= #0) jump:t #myfn_a
+      p1 = cmp.eq(r0, #0); if (p1.new) jump:nt #myfn_a }
+# CHECK-NONPIC-NEXT:  { if (r0<=#0) jump:t 0x200b4
+# CHECK-NONPIC-NEXT:    p1 = cmp.eq(r0,#0x0); if (p1.new) jump:nt 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-PIC-NEXT:     { if (r0<=#0) jump:t 0x10150
+# CHECK-PIC-NEXT:       p1 = cmp.eq(r0,#0x0); if (p1.new) jump:nt 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk>
+# CHECK-NEXT:           r2 = add(r0,r1)
+# CHECK-NEXT:           r4 = r5 }
+
+    {r0 = #0; jump #myfn_a}
+# CHECK-PIC-NEXT:    { r0 = #0x0 ; jump 0x10150 <__hexagon_thunk_myfn_a_from_.text.thunk> }
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x200b4 <__hexagon_thunk_myfn_a_from_.text.thunk> }
+    {r0 = #0; jump #myfn_b}
+# CHECK-PIC-NEXT:    { r0 = #0x0 ; jump 0x1015c <myfn_b> }
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x200bc <myfn_b> }
+    jumpr r31
+    .size   main, .-main
+
+    .section .text.foo
+    .skip 0x1000000
+
+    .globl myfn_a
+    .type  myfn_a, @function
+myfn_a:
+    {r0 = #0; jump #myfn_b}
+    jumpr r31
+    .size  myfn_a, .-myfn_a
+
+# CHECK-NONPIC: 01020110 <myfn_a>:
+# CHECK-NONPIC-NEXT: { r0 = #0x0 ; jump 0x1020118 <__hexagon_thunk_myfn_b_from_.text.thunk> }
+# CHECK-NONPIC-NEXT: { jumpr r31 }
+
+# CHECK-NONPIC: 01020118 <__hexagon_thunk_myfn_b_from_.text.thunk>:
+# CHECK-NONPIC-NEXT: { immext(#0xfeffff80)
+# CHECK-NONPIC-NEXT:   jump 0x200bc <myfn_b> }
+
+# CHECK-PIC:    010101b8 <__hexagon_thunk_myfn_b_from_.text.thunk>:
+# CHECK-PIC-NEXT:    { immext(#0xfeffff80)
+# CHECK-PIC-NEXT:      r14 = add(pc,##0xfeffffa4) }
+# CHECK-PIC-NEXT:    { jumpr r14 }
diff --git a/lld/test/ELF/hexagon-thunks.s b/lld/test/ELF/hexagon-thunks.s
new file mode 100644
index 000000000000..211074e1784b
--- /dev/null
+++ b/lld/test/ELF/hexagon-thunks.s
@@ -0,0 +1,53 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %t/a.s -o %t/a.o
+# RUN: ld.lld -T %t/lds %t/a.o -o %t/a
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-NONPIC,CHECK %s
+# RUN: llvm-mc -filetype=obj \
+# RUN:         -triple=hexagon-unknown-elf %t/a.s -o %t/a.o
+
+# RUN: ld.lld -T %t/lds --pie %t/a.o -o %t/a
+# RUN: llvm-objdump -d --no-show-raw-insn %t/a 2>&1 | \
+# RUN:     FileCheck --check-prefixes=CHECK-PIC,CHECK %s
+
+#--- a.s
+.section .text_low, "ax", %progbits
+    .globl main
+    .type  main, @function
+main:
+    call myfn
+    jumpr r31
+    .size   main, .-main
+
+.section .text_high, "ax", %progbits
+    .globl myfn
+    .type  myfn, @function
+myfn:
+    jumpr r31
+    .size  myfn, .-myfn
+
+# CHECK:  Disassembly of section .text_low:
+
+# CHECK:             <__hexagon_thunk_myfn_from_.text.thunk>:
+# CHECK-NONPIC-NEXT:    200b4: { immext(#0x1000000)
+# CHECK-NONPIC-NEXT:             jump 0x10200bc <myfn> }
+# CHECK-PIC-NEXT:       200b4: { immext(#0x1000000)
+# CHECK-PIC-NEXT:               r14 = add(pc,##0x1000008) }
+# CHECK-PIC-NEXT:       { jumpr r14 }
+
+# CHECK-NONPIC:      <main>:
+# CHECK-NONPIC-NEXT:   200bc: { call 0x200b4 <__hexagon_thunk_myfn_from_.text.thunk> }
+# CHECK-PIC:         <main>:
+# CHECK-PIC-NEXT:      200c0: { call 0x200b4 <__hexagon_thunk_myfn_from_.text.thunk> }
+# CHECK-NEXT:                 { jumpr r31 }
+
+# CHECK:  Disassembly of section .text_high:
+# CHECK:    <myfn>:
+# CHECK-NEXT: 10200bc: { jumpr r31 }
+
+#--- lds
+SECTIONS {
+  .text_low 0x200b4: { *(.text_low) }
+  .text_high 0x10200bc : { *(.text_high) }
+}
diff --git a/lld/test/ELF/hexagon-tls-allocateaux-multiple.s b/lld/test/ELF/hexagon-tls-allocateaux-multiple.s
new file mode 100644
index 000000000000..a77cc822e67d
--- /dev/null
+++ b/lld/test/ELF/hexagon-tls-allocateaux-multiple.s
@@ -0,0 +1,36 @@
+# REQUIRES: hexagon
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf a.s -o a.o
+# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf b.s -o b.o
+# RUN: ld.lld -shared a.o b.o -o out.so
+# RUN: llvm-readobj -r out.so | FileCheck --check-prefix=RELOC %s
+
+#--- a.s
+.globl _start
+.type _start, @function
+
+_start:
+  r2 = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
+  r0 = add(r2,##tls_var@GDGOT)
+  call tls_var@GDPLT
+  jumpr r31
+
+.section .tdata,"awT",@progbits
+.globl tls_var
+.type tls_var, @object
+tls_var:
+  .word 0x1234
+
+#--- b.s
+.globl other_func
+.type other_func, @function
+
+other_func:
+  ## Direct call to __tls_get_addr - this creates another path that may
+  ## try to allocate auxiliary data for the same symbol
+  call __tls_get_addr
+  jumpr r31
+
+# RELOC:      Section ({{.*}}) .rela.plt {
+# RELOC:        R_HEX_JMP_SLOT __tls_get_addr 0x0
+# RELOC:      }
diff --git a/lld/test/ELF/hexagon-tls-gd-xform.s b/lld/test/ELF/hexagon-tls-gd-xform.s
index 65aeb118fcb3..ade54e8a16fa 100644
--- a/lld/test/ELF/hexagon-tls-gd-xform.s
+++ b/lld/test/ELF/hexagon-tls-gd-xform.s
@@ -18,10 +18,10 @@
 _start:
 .ifdef GDPLT
                         call x@gdplt
-# CHECK_GDPLT:  101ec: { call 0x10220 }
+# CHECK_GDPLT:  101ec: { call 0x10220 <__tls_get_addr@plt> }
 .else
                   call x
-# CHECK:  101b8: { call 0x101e0 }
+# CHECK:  101b8: { call 0x101e0 <x@plt> }
 .endif
 
 # CHECK_GDPLT:        10220: { immext(#0x20040)
diff --git a/lld/test/ELF/loongarch-pc-hi20-lo12-got.s b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s
new file mode 100644
index 000000000000..acd94007d0ff
--- /dev/null
+++ b/lld/test/ELF/loongarch-pc-hi20-lo12-got.s
@@ -0,0 +1,145 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
+
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o
+
+# RUN: ld.lld a.o -T within-range.t -o a
+# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s
+
+## This test verifies the encoding when the register $a0 is used.
+# CHECK:      pcalau12i $a0, 0
+# CHECK-NEXT: addi.d    $a0, $a0, -2048
+
+## PCALAU12I contains a nonzero addend, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a1, 2
+# CHECK-NEXT: ld.d      $a1, $a1, -2048
+
+## LD contains a nonzero addend, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a2, 2
+# CHECK-NEXT: ld.d      $a2, $a2, -2040
+
+## PCALAU12I and LD use different registers, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a3, 2
+# CHECK-NEXT: ld.d      $a4, $a3, -2048
+
+## PCALAU12I and LD use different registers, no relaxations should be applied.
+# CHECK-NEXT: pcalau12i $a5, 2
+# CHECK-NEXT: ld.d      $a5, $a6, -2048
+
+# RUN: ld.lld a.o -T underflow-range.t -o a-underflow
+# RUN: llvm-objdump -d --no-show-raw-insn a-underflow | FileCheck --check-prefix=OUTRANGE %s
+
+# RUN: ld.lld a.o -T overflow-range.t -o a-overflow
+# RUN: llvm-objdump -d --no-show-raw-insn a-overflow | FileCheck --check-prefix=OUTRANGE %s
+
+# OUTRANGE:      pcalau12i $a0, 1
+# OUTRANGE-NEXT: ld.d      $a0, $a0, 0
+
+## Relocations do not appear in pairs, no relaxations should be applied.
+# RUN: ld.lld unpaired.o -T within-range.t  -o unpaired
+# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s
+
+# UNPAIRED:         pcalau12i $a0, 2
+# UNPAIRED-NEXT:    b         8
+# UNPAIRED-NEXT:    pcalau12i $a0, 2
+# UNPAIRED:         ld.d      $a0, $a0, -2048
+
+## Relocations do not appear in pairs, no relaxations should be applied.
+# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr
+# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s
+
+# LONE-LDR:         ld.d   $a0, $a0, -2048
+
+## 32-bit code is mostly the same. We only test a few variants.
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o
+# RUN: ld.lld a.32.o -T within-range.t -o a32
+# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s
+
+## This test verifies the encoding when the register $a0 is used.
+# CHECK32:      pcalau12i $a0, 0
+# CHECK32-NEXT: addi.w    $a0, $a0, -2048
+
+
+## This linker script ensures that .rodata and .text are sufficiently close to
+## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + add.
+#--- within-range.t
+SECTIONS {
+ .rodata 0x1800: { *(.rodata) }
+ .text   0x2800: { *(.text) }
+ .got    0x3800: { *(.got) }
+}
+
+## This linker script ensures that .rodata and .text are sufficiently far apart
+## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + add.
+#--- underflow-range.t
+SECTIONS {
+ .rodata 0x800-4: { *(.rodata) }
+ .got    0x80002000: { *(.got) }
+ .text   0x80001000: { *(.text) }  /* (0x800-4)+2GB+0x800+4 */
+}
+
+#--- overflow-range.t
+SECTIONS {
+ .text   0x1000: { *(.text) }
+ .got    0x2000: { *(.got) }
+ .rodata 0x80000800 : { *(.rodata) }  /* 0x1000+2GB-0x800 */
+}
+
+#--- a.s
+## Symbol 'x' is nonpreemptible, the optimization should be applied.
+.rodata
+.hidden x
+x:
+.word 10
+
+.text
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  ld.d      $a0, $a0, %got_pc_lo12(x)
+  pcalau12i $a1, %got_pc_hi20(x+1)
+  ld.d      $a1, $a1, %got_pc_lo12(x)
+  pcalau12i $a2, %got_pc_hi20(x)
+  ld.d      $a2, $a2, %got_pc_lo12(x+8)
+  pcalau12i $a3, %got_pc_hi20(x)
+  ld.d      $a4, $a3, %got_pc_lo12(x)
+  pcalau12i $a5, %got_pc_hi20(x)
+  ld.d      $a5, $a6, %got_pc_lo12(x)
+
+#--- unpaired.s
+.text
+.hidden x
+x:
+  nop
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  b L
+  pcalau12i $a0, %got_pc_hi20(x)
+L:
+  ld.d      $a0, $a0, %got_pc_lo12(x)
+
+#--- lone-ldr.s
+.text
+.hidden x
+x:
+  nop
+.global _start
+_start:
+  ld.d     $a0, $a0, %got_pc_lo12(x)
+
+
+#--- a.32.s
+## Symbol 'x' is nonpreemptible, the optimization should be applied.
+.rodata
+.hidden x
+x:
+.word 10
+
+.text
+.global _start
+_start:
+  pcalau12i $a0, %got_pc_hi20(x)
+  ld.w      $a0, $a0, %got_pc_lo12(x)
diff --git a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
index a417d89e9fa2..08d5d3e950d8 100644
--- a/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
+++ b/lld/test/ELF/loongarch-relax-pc-hi20-lo12.s
@@ -1,22 +1,23 @@
 # REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t && cd %t
 
-# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax %s -o %t.32.o
-# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t.64.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax a.s -o a.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax a.s -o a.64.o
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.32.o -o %t.32
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.64.o -o %t.64
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32 | FileCheck --check-prefixes=RELAX %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64 | FileCheck --check-prefixes=RELAX %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.32.o -o a.32
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.64.o -o a.64
+# RUN: llvm-objdump -td --no-show-raw-insn a.32 | FileCheck --check-prefixes=RELAX %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64 | FileCheck --check-prefixes=RELAX %s
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.32.o -shared -o %t.32s
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 %t.64.o -shared -o %t.64s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32s | FileCheck --check-prefixes=RELAX %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64s | FileCheck --check-prefixes=RELAX %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.32.o -shared -o a.32s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 a.64.o -shared -o a.64s
+# RUN: llvm-objdump -td --no-show-raw-insn a.32s | FileCheck --check-prefixes=RELAX %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64s | FileCheck --check-prefixes=RELAX %s
 
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 %t.32.o -o %t.32o
-# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 %t.64.o -o %t.64o
-# RUN: llvm-objdump -td --no-show-raw-insn %t.32o | FileCheck --check-prefixes=NORELAX32 %s
-# RUN: llvm-objdump -td --no-show-raw-insn %t.64o | FileCheck --check-prefixes=NORELAX64 %s
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 a.32.o -o a.32o
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x410000 a.64.o -o a.64o
+# RUN: llvm-objdump -td --no-show-raw-insn a.32o | FileCheck --check-prefixes=NORELAX32 %s
+# RUN: llvm-objdump -td --no-show-raw-insn a.64o | FileCheck --check-prefixes=NORELAX64 %s
 
 # RELAX-LABEL: <_start>:
 ## offset = 0x14000 - 0x10000 = 4096<<2
@@ -30,25 +31,46 @@
 ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
 # NORELAX32-NEXT:  10000:  pcalau12i     $a0, 1024
 # NORELAX32-NEXT:          addi.w        $a0, $a0, 0
+## Not relaxation, convertion to PCRel.
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
-# NORELAX32-NEXT:          ld.w          $a0, $a0, 4
+# NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
 # NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 # NORELAX32-NEXT:          pcalau12i     $a0, 1024
-# NORELAX32-NEXT:          ld.w          $a0, $a0, 4
+# NORELAX32-NEXT:          addi.w        $a0, $a0, 0
 
 # NORELAX64-LABEL: <_start>:
 ## offset exceed range of pcaddi
 ## offset = 0x410000 - 0x10000: 0x400 pages, page offset 0
 # NORELAX64-NEXT:  10000:  pcalau12i     $a0, 1024
 # NORELAX64-NEXT:          addi.d        $a0, $a0, 0
+## Not relaxation, convertion to PCRel.
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
-# NORELAX64-NEXT:          ld.d          $a0, $a0, 8
+# NORELAX64-NEXT:          addi.d        $a0, $a0, 0
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
 # NORELAX64-NEXT:          addi.d        $a0, $a0, 0
 # NORELAX64-NEXT:          pcalau12i     $a0, 1024
-# NORELAX64-NEXT:          ld.d          $a0, $a0, 8
+# NORELAX64-NEXT:          addi.d        $a0, $a0, 0
+
+
+## GOT references with non-zero addends. No relaxation.
+# RUN: llvm-mc --filetype=obj --triple=loongarch32 -mattr=+relax nonzero.s -o nonzero.32.o
+# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax nonzero.s -o nonzero.64.o
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 nonzero.32.o -o nonzero.32
+# RUN: ld.lld --section-start=.text=0x10000 --section-start=.data=0x14000 nonzero.64.o -o nonzero.64
+# RUN: llvm-objdump -td --no-show-raw-insn nonzero.32 | FileCheck --check-prefixes=NONZERO32 %s
+# RUN: llvm-objdump -td --no-show-raw-insn nonzero.64 | FileCheck --check-prefixes=NONZERO64 %s
+
+# NONZERO32-LABEL: <_start>:
+# NONZERO32-NEXT:      10000:  pcalau12i $a0, 4
+# NONZERO32-NEXT:              ld.w      $a0, $a0, 8
+
+# NONZERO64-LABEL: <_start>:
+# NONZERO64-NEXT:      10000:  pcalau12i $a0, 4
+# NONZERO64-NEXT:              ld.d      $a0, $a0, 12
+
 
+#--- a.s
 .section .text
 .global _start
 _start:
@@ -60,3 +82,14 @@ _start:
 .section .data
 sym:
   .zero 4
+
+
+#--- nonzero.s
+.section .text
+.global _start
+_start:
+  la.got    $a0, sym+4
+
+.section .data
+sym:
+  .zero 4
diff --git a/lld/test/ELF/loongarch-relax-tlsdesc.s b/lld/test/ELF/loongarch-relax-tlsdesc.s
index 5f4368343471..025cbc09fbdd 100644
--- a/lld/test/ELF/loongarch-relax-tlsdesc.s
+++ b/lld/test/ELF/loongarch-relax-tlsdesc.s
@@ -9,7 +9,6 @@
 # RUN: llvm-readobj -r -x .got a.64.so | FileCheck --check-prefix=GD64-RELA %s
 # RUN: llvm-objdump --no-show-raw-insn -dr -h a.64.so | FileCheck %s --check-prefix=GD64
 
-## FIXME: IE/LE relaxation have not yet been implemented, --relax/--no-relax obtain the same results.
 ## Transition from TLSDESC to IE/LE. Also check --emit-relocs.
 # RUN: ld.lld -e 0 -z now --emit-relocs a.64.o c.64.o -o a.64.le
 # RUN: llvm-readobj -r -x .got a.64.le 2>&1 | FileCheck --check-prefix=LE64-RELA %s
@@ -73,25 +72,21 @@
 # LE64-RELA: could not find section '.got'
 
 ## a@tprel = 0x8
-# LE64:        20158: nop
+# LE64:        20158: ori     $a0, $zero, 8
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          ori     $a0, $zero, 8
 # LE64-NEXT:            R_LARCH_TLS_DESC_CALL a
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
 # LE64-NEXT:          add.d   $a1, $a0, $tp
 
 ## b@tprel = 0x7ff
-# LE64:        2016c: nop
+# LE64:        20160: nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 b
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 b
 # LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD b
@@ -101,7 +96,7 @@
 
 ## c@tprel = 0x800
 ## Without R_LARCH_RELAX relocation. No relaxation.
-# LE64:        20180: nop
+# LE64:        20170: nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 c
 # LE64-NEXT:          addi.d  $t0, $zero, 0
 # LE64-NEXT:          nop
@@ -115,13 +110,11 @@
 # LE64-NEXT:          add.d   $a3, $a0, $tp
 
 ## d@tprel = 0x1000
-# LE64:        201a0: nop
+# LE64:        20190: lu12i.w $a0, 1
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 d
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          nop
 # LE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 d
 # LE64-NEXT:            R_LARCH_RELAX *ABS*
-# LE64-NEXT:          lu12i.w $a0, 1
 # LE64-NEXT:            R_LARCH_TLS_DESC_LD d
 # LE64-NEXT:          ori     $a0, $a0, 0
 # LE64-NEXT:            R_LARCH_TLS_DESC_CALL d
@@ -160,35 +153,31 @@
 # LE64-NORELAX-NEXT:          add.d   $a4, $a0, $tp
 
 # IE64-RELA:      .rela.dyn {
-# IE64-RELA-NEXT:   0x30408 R_LARCH_TLS_TPREL64 c 0x0
-# IE64-RELA-NEXT:   0x30410 R_LARCH_TLS_TPREL64 d 0x0
+# IE64-RELA-NEXT:   0x303F0 R_LARCH_TLS_TPREL64 c 0x0
+# IE64-RELA-NEXT:   0x303F8 R_LARCH_TLS_TPREL64 d 0x0
 # IE64-RELA-NEXT: }
 # IE64-RELA:      Hex dump of section '.got':
-# IE64-RELA-NEXT: 0x00030408 00000000 00000000 00000000 00000000 .
+# IE64-RELA-NEXT: 0x000303f0 00000000 00000000 00000000 00000000 .
 
-# IE64:   .got           00000010 0000000000030408
+# IE64:   .got           00000010 00000000000303f0
 
 ## a and b are optimized to use LE. c and d are optimized to IE.
 ## a@tprel = 0x8
-# IE64:        202c8: nop
+# IE64:        202c8: ori     $a0, $zero, 8
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          ori     $a0, $zero, 8
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL a
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
 # IE64-NEXT:          add.d   $a1, $a0, $tp
 
 ## b@tprel = 0x7ff
-# IE64:        202dc: nop
+# IE64:        202d0: nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 b
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 b
 # IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD b
@@ -196,9 +185,9 @@
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL b
 # IE64-NEXT:          add.d   $a2, $a0, $tp
 
-## &.got[c]-. = 0x30408 - 0x20300: 0x10 pages, page offset 0x408
+## &.got[c]-. = 0x303f0 - 0x202f0: 0x10 pages, page offset 0x3f0
 ## Without R_LARCH_RELAX relocation. No relaxation.
-# IE64:        202f0: nop
+# IE64:        202e0: nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 c
 # IE64-NEXT:          addi.d  $t0, $zero, 0
 # IE64-NEXT:          nop
@@ -207,20 +196,18 @@
 # IE64-NEXT:          pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD c
 # IE64-NEXT:          addi.d  $t0, $t0, 1
-# IE64-NEXT:          ld.d    $a0, $a0, 1032
+# IE64-NEXT:          ld.d    $a0, $a0, 1008
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL c
 # IE64-NEXT:          add.d   $a3, $a0, $tp
 
-## &.got[d]-. = 0x30408+8 - 0x20318: 0x10 pages, page offset 0x410
-# IE64:        20310: nop
+## &.got[d]-. = 0x303f0+8 - 0x20300: 0x10 pages, page offset 0x3f8
+# IE64:        20300: pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_HI20 d
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          nop
 # IE64-NEXT:            R_LARCH_TLS_DESC_PC_LO12 d
 # IE64-NEXT:            R_LARCH_RELAX *ABS*
-# IE64-NEXT:          pcalau12i $a0, 16
 # IE64-NEXT:            R_LARCH_TLS_DESC_LD d
-# IE64-NEXT:          ld.d    $a0, $a0, 1040
+# IE64-NEXT:          ld.d    $a0, $a0, 1016
 # IE64-NEXT:            R_LARCH_TLS_DESC_CALL d
 # IE64-NEXT:          add.d   $a4, $a0, $tp
 
diff --git a/lldb/docs/use/mcp.md b/lldb/docs/use/mcp.md
index 375c164fe771..b7474246b54f 100644
--- a/lldb/docs/use/mcp.md
+++ b/lldb/docs/use/mcp.md
@@ -75,7 +75,69 @@ Configuration example for [Visual Studio Code](https://code.visualstudio.com/doc
 }
 ```
 
-### Troubleshooting
+## Tools
+
+Tools are a primitive in the Model Context Protocol that enable servers to
+expose functionality to clients.
+
+LLDB's MCP integration exposes one tool, named `lldb_command` which allows the
+model to run the same commands a user would type in the LLDB command
+interpreter. It takes two arguments:
+
+1. The unique debugger ID as a number.
+2. The command and its arguments as a string.
+
+## Resources
+
+Resources are a primitive in the Model Context Protocol that allow servers to
+expose content that can be read by clients.
+
+LLDB's MCP integration exposes a resource for each debugger and target
+instance. Debugger resources are accessible using the following URI:
+
+```
+lldb://debugger/<debugger id>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1",
+      "mimeType": "application/json",
+      "text": "{\"debugger_id\":1,\"name\":\"debugger_1\",\"num_targets\":1}"
+    }
+  ]
+}
+```
+
+Debuggers can contain one or more targets, which are accessible using the
+following URI:
+
+```
+lldb://debugger/<debugger id>/target/<target idx>
+```
+
+Example output:
+
+```json
+{
+  "contents": [
+    {
+      "uri": "lldb://debugger/1/target/0",
+      "mimeType": "application/json",
+      "text": "{\"arch\":\"arm64-apple-macosx26.0.0\",\"debugger_id\":1,\"dummy\":false,\"path\":\"/bin/count\",\"platform\":\"host\",\"selected\":true,\"target_idx\":0}"
+    }
+  ]
+}
+```
+
+Note that unlike the debugger id, which is unique, the target index is not
+stable and may be reused when a target is removed and a new target is added.
+
+## Troubleshooting
 
 The MCP server uses the `Host` log channel. You can enable logging with the
 `log enable` command.
diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py
index a4ff96e4158c..a47ffabdecd0 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbtest.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py
@@ -1380,6 +1380,9 @@ def isAArch64SMEFA64(self):
     def isAArch64MTE(self):
         return self.isAArch64() and "mte" in self.getCPUInfo()
 
+    def isAArch64MTEStoreOnly(self):
+        return self.isAArch64() and "mtestoreonly" in self.getCPUInfo()
+
     def isAArch64GCS(self):
         return self.isAArch64() and "gcs" in self.getCPUInfo()
 
diff --git a/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
index 35649b10ce45..a1aa7e1a69f3 100644
--- a/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
+++ b/lldb/packages/Python/lldbsuite/test/make/libcxx-simulators-common/compressed_pair.h
@@ -4,6 +4,14 @@
 #include <type_traits>
 #include <utility> // for std::forward
 
+// COMPRESSED_PAIR_REV versions:
+// 0 -> Post-c88580c layout
+// 1 -> Post-27c83382d83dc layout
+// 2 -> Post-769c42f4a552a layout
+// 3 -> Post-f5e687d7bf49c layout
+// 4 -> padding-less no_unique_address-based layout (introduced in
+// 27c83382d83dc)
+
 namespace std {
 namespace __lldb {
 
@@ -13,7 +21,50 @@ namespace __lldb {
 #define _LLDB_NO_UNIQUE_ADDRESS [[__no_unique_address__]]
 #endif
 
-#if COMPRESSED_PAIR_REV == 0 // Post-c88580c layout
+// From libc++ datasizeof.h
+template <class _Tp> struct _FirstPaddingByte {
+  _LLDB_NO_UNIQUE_ADDRESS _Tp __v_;
+  char __first_padding_byte_;
+};
+
+template <class _Tp>
+inline const size_t __datasizeof_v =
+    __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
+
+template <class _Tp>
+struct __lldb_is_final : public integral_constant<bool, __is_final(_Tp)> {};
+
+// The legacy layout has been patched, see
+// https://github.com/llvm/llvm-project/pull/142516.
+#if COMPRESSED_PAIR_REV == 1
+template <class _ToPad> class __compressed_pair_padding {
+  char __padding_[((is_empty<_ToPad>::value &&
+                    !__lldb_is_final<_ToPad>::value) ||
+                   is_reference<_ToPad>::value)
+                      ? 0
+                      : sizeof(_ToPad) - __datasizeof_v<_ToPad>];
+};
+#elif COMPRESSED_PAIR_REV > 1 && COMPRESSED_PAIR_REV < 4
+template <class _ToPad>
+inline const bool __is_reference_or_unpadded_object =
+    (std::is_empty<_ToPad>::value && !__lldb_is_final<_ToPad>::value) ||
+    sizeof(_ToPad) == __datasizeof_v<_ToPad>;
+
+template <class _Tp>
+inline const bool __is_reference_or_unpadded_object<_Tp &> = true;
+
+template <class _Tp>
+inline const bool __is_reference_or_unpadded_object<_Tp &&> = true;
+
+template <class _ToPad, bool _Empty = __is_reference_or_unpadded_object<_ToPad>>
+class __compressed_pair_padding {
+  char __padding_[sizeof(_ToPad) - __datasizeof_v<_ToPad>] = {};
+};
+
+template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
+#endif // COMPRESSED_PAIR_REV == 1
+
+#if COMPRESSED_PAIR_REV == 0
 struct __value_init_tag {};
 struct __default_init_tag {};
 
@@ -59,49 +110,6 @@ class __compressed_pair : private __compressed_pair_elem<_T1, 0>,
   _T1 &first() { return static_cast<_Base1 &>(*this).__get(); }
 };
 #elif COMPRESSED_PAIR_REV == 1 || COMPRESSED_PAIR_REV == 2
-// From libc++ datasizeof.h
-template <class _Tp> struct _FirstPaddingByte {
-  _LLDB_NO_UNIQUE_ADDRESS _Tp __v_;
-  char __first_padding_byte_;
-};
-
-template <class _Tp>
-inline const size_t __datasizeof_v =
-    __builtin_offsetof(_FirstPaddingByte<_Tp>, __first_padding_byte_);
-
-template <class _Tp>
-struct __lldb_is_final : public integral_constant<bool, __is_final(_Tp)> {};
-
-// The legacy layout has been patched, see
-// https://github.com/llvm/llvm-project/pull/142516.
-#if COMPRESSED_PAIR_REV == 1
-template <class _ToPad> class __compressed_pair_padding {
-  char __padding_[((is_empty<_ToPad>::value &&
-                    !__lldb_is_final<_ToPad>::value) ||
-                   is_reference<_ToPad>::value)
-                      ? 0
-                      : sizeof(_ToPad) - __datasizeof_v<_ToPad>];
-};
-#else
-template <class _ToPad>
-inline const bool __is_reference_or_unpadded_object =
-    (std::is_empty<_ToPad>::value && !__lldb_is_final<_ToPad>::value) ||
-    sizeof(_ToPad) == __datasizeof_v<_ToPad>;
-
-template <class _Tp>
-inline const bool __is_reference_or_unpadded_object<_Tp &> = true;
-
-template <class _Tp>
-inline const bool __is_reference_or_unpadded_object<_Tp &&> = true;
-
-template <class _ToPad, bool _Empty = __is_reference_or_unpadded_object<_ToPad>>
-class __compressed_pair_padding {
-  char __padding_[sizeof(_ToPad) - __datasizeof_v<_ToPad>] = {};
-};
-
-template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
-#endif
-
 #define _LLDB_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)              \
   [[__gnu__::__aligned__(                                                      \
       alignof(T2))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;                  \
@@ -119,6 +127,27 @@ template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
   _LLDB_NO_UNIQUE_ADDRESS T3 Initializer3;                                     \
   _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T3> __padding3_;
 #elif COMPRESSED_PAIR_REV == 3
+#define _LLDB_COMPRESSED_PAIR(T1, Initializer1, T2, Initializer2)              \
+  struct {                                                                     \
+    [[__gnu__::__aligned__(                                                    \
+        alignof(T2))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;                \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T1> __padding1_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T2 Initializer2;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T2> __padding2_;         \
+  }
+
+#define _LLDB_COMPRESSED_TRIPLE(T1, Initializer1, T2, Initializer2, T3,        \
+                                Initializer3)                                  \
+  struct {                                                                     \
+    [[using __gnu__: __aligned__(alignof(T2)),                                 \
+      __aligned__(alignof(T3))]] _LLDB_NO_UNIQUE_ADDRESS T1 Initializer1;      \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T1> __padding1_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T2 Initializer2;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T2> __padding2_;         \
+    _LLDB_NO_UNIQUE_ADDRESS T3 Initializer3;                                   \
+    _LLDB_NO_UNIQUE_ADDRESS __compressed_pair_padding<T3> __padding3_;         \
+  }
+#elif COMPRESSED_PAIR_REV == 4
 #define _LLDB_COMPRESSED_PAIR(T1, Name1, T2, Name2)                            \
   _LLDB_NO_UNIQUE_ADDRESS T1 Name1;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T2 Name2
@@ -127,7 +156,7 @@ template <class _ToPad> class __compressed_pair_padding<_ToPad, true> {};
   _LLDB_NO_UNIQUE_ADDRESS T1 Name1;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T2 Name2;                                            \
   _LLDB_NO_UNIQUE_ADDRESS T3 Name3
-#endif
+#endif // COMPRESSED_PAIR_REV == 3
 } // namespace __lldb
 } // namespace std
 
diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp
index a1de895c0ba9..c0b10797e506 100644
--- a/lldb/source/Host/windows/MainLoopWindows.cpp
+++ b/lldb/source/Host/windows/MainLoopWindows.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/WindowsError.h"
 #include <algorithm>
+#include <atomic>
 #include <cassert>
 #include <ctime>
 #include <io.h>
@@ -222,7 +223,7 @@ MainLoopWindows::RegisterReadObject(const IOObjectSP &object_sp,
 
   if (m_read_fds.find(waitable_handle) != m_read_fds.end()) {
     error = Status::FromErrorStringWithFormat(
-        "File descriptor %d already monitored.", waitable_handle);
+        "File descriptor %p already monitored.", waitable_handle);
     return nullptr;
   }
 
@@ -234,7 +235,7 @@ MainLoopWindows::RegisterReadObject(const IOObjectSP &object_sp,
   } else {
     DWORD file_type = GetFileType(waitable_handle);
     if (file_type != FILE_TYPE_PIPE) {
-      error = Status::FromErrorStringWithFormat("Unsupported file type %d",
+      error = Status::FromErrorStringWithFormat("Unsupported file type %ld",
                                                 file_type);
       return nullptr;
     }
diff --git a/lldb/source/Host/windows/PipeWindows.cpp b/lldb/source/Host/windows/PipeWindows.cpp
index 0b495fff69df..001396fafde0 100644
--- a/lldb/source/Host/windows/PipeWindows.cpp
+++ b/lldb/source/Host/windows/PipeWindows.cpp
@@ -279,7 +279,8 @@ llvm::Expected<size_t> PipeWindows::Read(void *buf, size_t size,
     return Status(failure_error, eErrorTypeWin32).takeError();
 
   DWORD timeout_msec =
-      timeout ? ceil<std::chrono::milliseconds>(*timeout).count() : INFINITE;
+      timeout ? std::chrono::ceil<std::chrono::milliseconds>(*timeout).count()
+              : INFINITE;
   DWORD wait_result =
       ::WaitForSingleObject(m_read_overlapped.hEvent, timeout_msec);
   if (wait_result != WAIT_OBJECT_0) {
@@ -324,7 +325,8 @@ llvm::Expected<size_t> PipeWindows::Write(const void *buf, size_t size,
     return Status(failure_error, eErrorTypeWin32).takeError();
 
   DWORD timeout_msec =
-      timeout ? ceil<std::chrono::milliseconds>(*timeout).count() : INFINITE;
+      timeout ? std::chrono::ceil<std::chrono::milliseconds>(*timeout).count()
+              : INFINITE;
   DWORD wait_result =
       ::WaitForSingleObject(m_write_overlapped.hEvent, timeout_msec);
   if (wait_result != WAIT_OBJECT_0) {
diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp
index 2529e78f78bc..633b860b5784 100644
--- a/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp
+++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangASTImporter.cpp
@@ -1053,6 +1053,16 @@ ClangASTImporter::MapCompleter::~MapCompleter() = default;
 
 llvm::Expected<Decl *>
 ClangASTImporter::ASTImporterDelegate::ImportImpl(Decl *From) {
+  // FIXME: The Minimal import mode of clang::ASTImporter does not correctly
+  // import Lambda definitions. Work around this for now by not importing
+  // lambdas at all. This is most likely encountered when importing decls from
+  // the `std` module (not from debug-info), where lambdas can be defined in
+  // inline function bodies. Those will be imported by LLDB.
+  if (const auto *CXX = llvm::dyn_cast<clang::CXXRecordDecl>(From))
+    if (CXX->isLambda())
+      return llvm::make_error<ASTImportError>(
+          ASTImportError::UnsupportedConstruct);
+
   if (m_std_handler) {
     std::optional<Decl *> D = m_std_handler->Import(From);
     if (D) {
diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
index 9a869f3ea028..862082f8a16b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp
@@ -790,31 +790,27 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
                 lldb_private::formatters::LibcxxStringSummaryProviderASCII,
                 "std::string summary provider",
                 "^std::__[[:alnum:]]+::basic_string<char, "
-                "std::__[[:alnum:]]+::char_traits<char>, "
-                "std::__[[:alnum:]]+::allocator<char> >$",
+                "std::__[[:alnum:]]+::char_traits<char>,.*>$",
                 stl_summary_flags, true);
   AddCXXSummary(cpp_category_sp,
                 lldb_private::formatters::LibcxxStringSummaryProviderASCII,
                 "std::string summary provider",
                 "^std::__[[:alnum:]]+::basic_string<unsigned char, "
-                "std::__[[:alnum:]]+::char_traits<unsigned char>, "
-                "std::__[[:alnum:]]+::allocator<unsigned char> >$",
+                "std::__[[:alnum:]]+::char_traits<unsigned char>,.*>$",
                 stl_summary_flags, true);
 
   AddCXXSummary(cpp_category_sp,
                 lldb_private::formatters::LibcxxStringSummaryProviderUTF16,
                 "std::u16string summary provider",
                 "^std::__[[:alnum:]]+::basic_string<char16_t, "
-                "std::__[[:alnum:]]+::char_traits<char16_t>, "
-                "std::__[[:alnum:]]+::allocator<char16_t> >$",
+                "std::__[[:alnum:]]+::char_traits<char16_t>,.*>$",
                 stl_summary_flags, true);
 
   AddCXXSummary(cpp_category_sp,
                 lldb_private::formatters::LibcxxStringSummaryProviderUTF32,
                 "std::u32string summary provider",
                 "^std::__[[:alnum:]]+::basic_string<char32_t, "
-                "std::__[[:alnum:]]+::char_traits<char32_t>, "
-                "std::__[[:alnum:]]+::allocator<char32_t> >$",
+                "std::__[[:alnum:]]+::char_traits<char32_t>,.*>$",
                 stl_summary_flags, true);
 
   AddCXXSummary(cpp_category_sp,
@@ -825,8 +821,7 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
                 lldb_private::formatters::LibcxxWStringSummaryProvider,
                 "std::wstring summary provider",
                 "^std::__[[:alnum:]]+::basic_string<wchar_t, "
-                "std::__[[:alnum:]]+::char_traits<wchar_t>, "
-                "std::__[[:alnum:]]+::allocator<wchar_t> >$",
+                "std::__[[:alnum:]]+::char_traits<wchar_t>,.*>$",
                 stl_summary_flags, true);
 
   AddCXXSummary(cpp_category_sp,
@@ -1342,24 +1337,16 @@ static void RegisterStdStringSummaryProvider(
 
   category_sp->AddTypeSummary(makeSpecifier(string_ty), summary_sp);
 
-  // std::basic_string<char>
   category_sp->AddTypeSummary(
       makeSpecifier(llvm::formatv("std::basic_string<{}>", char_ty).str()),
       summary_sp);
-  // std::basic_string<char,std::char_traits<char>,std::allocator<char> >
-  category_sp->AddTypeSummary(
-      makeSpecifier(llvm::formatv("std::basic_string<{0},std::char_traits<{0}>,"
-                                  "std::allocator<{0}> >",
-                                  char_ty)
-                        .str()),
-      summary_sp);
-  // std::basic_string<char, std::char_traits<char>, std::allocator<char> >
+
   category_sp->AddTypeSummary(
-      makeSpecifier(
-          llvm::formatv("std::basic_string<{0}, std::char_traits<{0}>, "
-                        "std::allocator<{0}> >",
+      std::make_shared<lldb_private::TypeNameSpecifierImpl>(
+          llvm::formatv("^std::basic_string<{0}, ?std::char_traits<{0}>,.*>$",
                         char_ty)
-              .str()),
+              .str(),
+          eFormatterMatchRegex),
       summary_sp);
 }
 
@@ -1382,20 +1369,17 @@ static void LoadLibStdcppFormatters(lldb::TypeCategoryImplSP cpp_category_sp) {
   cpp_category_sp->AddTypeSummary("std::__cxx11::string", eFormatterMatchExact,
                                   string_summary_sp);
   cpp_category_sp->AddTypeSummary(
-      "std::__cxx11::basic_string<char, std::char_traits<char>, "
-      "std::allocator<char> >",
-      eFormatterMatchExact, string_summary_sp);
-  cpp_category_sp->AddTypeSummary("std::__cxx11::basic_string<unsigned char, "
-                                  "std::char_traits<unsigned char>, "
-                                  "std::allocator<unsigned char> >",
-                                  eFormatterMatchExact, string_summary_sp);
+      "^std::__cxx11::basic_string<char, std::char_traits<char>,.*>$",
+      eFormatterMatchRegex, string_summary_sp);
+  cpp_category_sp->AddTypeSummary("^std::__cxx11::basic_string<unsigned char, "
+                                  "std::char_traits<unsigned char>,.*>$",
+                                  eFormatterMatchRegex, string_summary_sp);
 
   cpp_category_sp->AddTypeSummary("std::__cxx11::wstring", eFormatterMatchExact,
                                   string_summary_sp);
   cpp_category_sp->AddTypeSummary(
-      "std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>, "
-      "std::allocator<wchar_t> >",
-      eFormatterMatchExact, string_summary_sp);
+      "^std::__cxx11::basic_string<wchar_t, std::char_traits<wchar_t>,.*>$",
+      eFormatterMatchRegex, string_summary_sp);
 
   SyntheticChildren::Flags stl_synth_flags;
   stl_synth_flags.SetCascades(true).SetSkipPointers(false).SetSkipReferences(
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
index a7874047942c..6053d042b29b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp
@@ -49,11 +49,6 @@ static void consumeInlineNamespace(llvm::StringRef &name) {
   }
 }
 
-bool lldb_private::formatters::isOldCompressedPairLayout(
-    ValueObject &pair_obj) {
-  return isStdTemplate(pair_obj.GetTypeName(), "__compressed_pair");
-}
-
 bool lldb_private::formatters::isStdTemplate(ConstString type_name,
                                              llvm::StringRef type) {
   llvm::StringRef name = type_name.GetStringRef();
@@ -105,6 +100,44 @@ lldb_private::formatters::GetSecondValueOfLibCXXCompressedPair(
   return value;
 }
 
+std::pair<lldb::ValueObjectSP, bool>
+lldb_private::formatters::GetValueOrOldCompressedPair(
+    ValueObject &obj, size_t anon_struct_idx, llvm::StringRef child_name,
+    llvm::StringRef compressed_pair_name) {
+  auto is_old_compressed_pair = [](ValueObject &pair_obj) -> bool {
+    return isStdTemplate(pair_obj.GetTypeName(), "__compressed_pair");
+  };
+
+  // Try searching the child member in an anonymous structure first.
+  if (auto unwrapped = obj.GetChildAtIndex(anon_struct_idx)) {
+    ValueObjectSP node_sp(obj.GetChildMemberWithName(child_name));
+    if (node_sp)
+      return {node_sp, is_old_compressed_pair(*node_sp)};
+  }
+
+  // Older versions of libc++ don't wrap the children in anonymous structures.
+  // Try that instead.
+  ValueObjectSP node_sp(obj.GetChildMemberWithName(child_name));
+  if (node_sp)
+    return {node_sp, is_old_compressed_pair(*node_sp)};
+
+  // Try the even older __compressed_pair layout.
+
+  assert(!compressed_pair_name.empty());
+
+  node_sp = obj.GetChildMemberWithName(compressed_pair_name);
+
+  // Unrecognized layout (possibly older than LLDB supports).
+  if (!node_sp)
+    return {nullptr, false};
+
+  // Expected old compressed_pair layout, but got something else.
+  if (!is_old_compressed_pair(*node_sp))
+    return {nullptr, false};
+
+  return {node_sp, true};
+}
+
 bool lldb_private::formatters::LibcxxFunctionSummaryProvider(
     ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) {
 
@@ -205,11 +238,12 @@ bool lldb_private::formatters::LibcxxUniquePointerSummaryProvider(
   if (!valobj_sp)
     return false;
 
-  ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_"));
+  auto [ptr_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *valobj_sp, /*anon_struct_idx=*/0, "__ptr_", "__ptr_");
   if (!ptr_sp)
     return false;
 
-  if (isOldCompressedPairLayout(*ptr_sp))
+  if (is_compressed_pair)
     ptr_sp = GetFirstValueOfLibCXXCompressedPair(*ptr_sp);
 
   if (!ptr_sp)
@@ -379,13 +413,14 @@ lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd::Update() {
   if (!valobj_sp)
     return lldb::ChildCacheState::eRefetch;
 
-  ValueObjectSP ptr_sp(valobj_sp->GetChildMemberWithName("__ptr_"));
+  auto [ptr_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *valobj_sp, /*anon_struct_idx=*/0, "__ptr_", "__ptr_");
   if (!ptr_sp)
     return lldb::ChildCacheState::eRefetch;
 
   // Retrieve the actual pointer and the deleter, and clone them to give them
   // user-friendly names.
-  if (isOldCompressedPairLayout(*ptr_sp)) {
+  if (is_compressed_pair) {
     if (ValueObjectSP value_pointer_sp =
             GetFirstValueOfLibCXXCompressedPair(*ptr_sp))
       m_value_ptr_sp = value_pointer_sp->Clone(ConstString("pointer"));
@@ -424,17 +459,15 @@ enum class StringLayout { CSD, DSC };
 }
 
 static ValueObjectSP ExtractLibCxxStringData(ValueObject &valobj) {
-  if (auto rep_sp = valobj.GetChildMemberWithName("__rep_"))
-    return rep_sp;
-
-  ValueObjectSP valobj_r_sp = valobj.GetChildMemberWithName("__r_");
-  if (!valobj_r_sp || !valobj_r_sp->GetError().Success())
+  auto [valobj_r_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      valobj, /*anon_struct_idx=*/0, "__rep_", "__r_");
+  if (!valobj_r_sp)
     return nullptr;
 
-  if (!isOldCompressedPairLayout(*valobj_r_sp))
-    return nullptr;
+  if (is_compressed_pair)
+    return GetFirstValueOfLibCXXCompressedPair(*valobj_r_sp);
 
-  return GetFirstValueOfLibCXXCompressedPair(*valobj_r_sp);
+  return valobj_r_sp;
 }
 
 /// Determine the size in bytes of \p valobj (a libc++ std::string object) and
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
index d88a6ecb1fa8..819f8a985f9b 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h
@@ -25,7 +25,22 @@ GetChildMemberWithName(ValueObject &obj,
 
 lldb::ValueObjectSP GetFirstValueOfLibCXXCompressedPair(ValueObject &pair);
 lldb::ValueObjectSP GetSecondValueOfLibCXXCompressedPair(ValueObject &pair);
-bool isOldCompressedPairLayout(ValueObject &pair_obj);
+
+/// Returns the ValueObjectSP of the child of \c obj. If \c obj has no
+/// child named \c child_name, returns the __compressed_pair child instead
+/// with \c compressed_pair_name, if one exists.
+///
+/// Latest libc++ wrap the compressed children in an anonymous structure.
+/// The \c anon_struct_idx indicates the location of this struct.
+///
+/// The returned boolean is \c true if the returned child was has an old-style
+/// libc++ __compressed_pair layout.
+///
+/// If no child was found returns a nullptr.
+std::pair<lldb::ValueObjectSP, bool>
+GetValueOrOldCompressedPair(ValueObject &obj, size_t anon_struct_idx,
+                            llvm::StringRef child_name,
+                            llvm::StringRef compressed_pair_name);
 bool isStdTemplate(ConstString type_name, llvm::StringRef type);
 
 bool LibcxxStringSummaryProviderASCII(
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
index 826e6ab090e1..dfc23266fc14 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp
@@ -297,11 +297,18 @@ lldb::ChildCacheState ForwardListFrontEnd::Update() {
   if (err.Fail() || !backend_addr)
     return lldb::ChildCacheState::eRefetch;
 
-  ValueObjectSP impl_sp(m_backend.GetChildMemberWithName("__before_begin_"));
+  auto list_base_sp = m_backend.GetChildAtIndex(0);
+  if (!list_base_sp)
+    return lldb::ChildCacheState::eRefetch;
+
+  // Anonymous strucutre index is in base class at index 0.
+  auto [impl_sp, is_compressed_pair] =
+      GetValueOrOldCompressedPair(*list_base_sp, /*anon_struct_idx=*/0,
+                                  "__before_begin_", "__before_begin_");
   if (!impl_sp)
     return ChildCacheState::eRefetch;
 
-  if (isOldCompressedPairLayout(*impl_sp))
+  if (is_compressed_pair)
     impl_sp = GetFirstValueOfLibCXXCompressedPair(*impl_sp);
 
   if (!impl_sp)
@@ -324,17 +331,10 @@ llvm::Expected<uint32_t> ListFrontEnd::CalculateNumChildren() {
   if (!m_head || !m_tail || m_node_address == 0)
     return 0;
 
-  ValueObjectSP size_node_sp(m_backend.GetChildMemberWithName("__size_"));
-  if (!size_node_sp) {
-    size_node_sp = m_backend.GetChildMemberWithName(
-        "__size_alloc_"); // pre-compressed_pair rework
-
-    if (!isOldCompressedPairLayout(*size_node_sp))
-      return llvm::createStringError("Unexpected std::list layout: expected "
-                                     "old __compressed_pair layout.");
-
+  auto [size_node_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      m_backend, /*anon_struct_idx=*/1, "__size_", "__size_alloc_");
+  if (is_compressed_pair)
     size_node_sp = GetFirstValueOfLibCXXCompressedPair(*size_node_sp);
-  }
 
   if (size_node_sp)
     m_count = size_node_sp->GetValueAsUnsigned(UINT32_MAX);
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
index 41441dfbc718..85766966f155 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxMap.cpp
@@ -200,7 +200,8 @@ class LibcxxStdMapSyntheticFrontEnd : public SyntheticChildrenFrontEnd {
   llvm::Expected<size_t> GetIndexOfChildWithName(ConstString name) override;
 
 private:
-  llvm::Expected<uint32_t> CalculateNumChildrenForOldCompressedPairLayout();
+  llvm::Expected<uint32_t>
+  CalculateNumChildrenForOldCompressedPairLayout(ValueObject &pair);
 
   /// Returns the ValueObject for the __tree_node type that
   /// holds the key/value pair of the node at index \ref idx.
@@ -254,16 +255,8 @@ lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::
 
 llvm::Expected<uint32_t>
 lldb_private::formatters::LibcxxStdMapSyntheticFrontEnd::
-    CalculateNumChildrenForOldCompressedPairLayout() {
-  ValueObjectSP node_sp(m_tree->GetChildMemberWithName("__pair3_"));
-  if (!node_sp)
-    return 0;
-
-  if (!isOldCompressedPairLayout(*node_sp))
-    return llvm::createStringError("Unexpected std::map layout: expected "
-                                   "old __compressed_pair layout.");
-
-  node_sp = GetFirstValueOfLibCXXCompressedPair(*node_sp);
+    CalculateNumChildrenForOldCompressedPairLayout(ValueObject &pair) {
+  auto node_sp = GetFirstValueOfLibCXXCompressedPair(pair);
 
   if (!node_sp)
     return 0;
@@ -281,12 +274,16 @@ llvm::Expected<uint32_t> lldb_private::formatters::
   if (m_tree == nullptr)
     return 0;
 
-  if (auto node_sp = m_tree->GetChildMemberWithName("__size_")) {
-    m_count = node_sp->GetValueAsUnsigned(0);
-    return m_count;
-  }
+  auto [size_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *m_tree, /*anon_struct_idx=*/2, "__size_", "__pair3_");
+  if (!size_sp)
+    return llvm::createStringError("Unexpected std::map layout");
 
-  return CalculateNumChildrenForOldCompressedPairLayout();
+  if (is_compressed_pair)
+    return CalculateNumChildrenForOldCompressedPairLayout(*size_sp);
+
+  m_count = size_sp->GetValueAsUnsigned(0);
+  return m_count;
 }
 
 ValueObjectSP
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
index 501fd0945b82..f88a5319068a 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxUnorderedMap.cpp
@@ -130,22 +130,17 @@ CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
 
 CompilerType lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     GetNodeType() {
-  auto node_sp = m_backend.GetChildAtNamePath({"__table_", "__first_node_"});
-
-  if (!node_sp) {
-    auto p1_sp = m_backend.GetChildAtNamePath({"__table_", "__p1_"});
-    if (!p1_sp)
-      return {};
+  auto table_sp = m_backend.GetChildMemberWithName("__table_");
+  if (!table_sp)
+    return {};
 
-    if (!isOldCompressedPairLayout(*p1_sp))
-      return {};
+  auto [node_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      *table_sp, /*anon_struct_idx=*/1, "__first_node_", "__p1_");
+  if (is_compressed_pair)
+    node_sp = GetFirstValueOfLibCXXCompressedPair(*node_sp);
 
-    node_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp);
-    if (!node_sp)
-      return {};
-  }
-
-  assert(node_sp);
+  if (!node_sp)
+    return {};
 
   return node_sp->GetCompilerType().GetTypeTemplateArgument(0).GetPointeeType();
 }
@@ -223,19 +218,15 @@ lldb::ValueObjectSP lldb_private::formatters::
 llvm::Expected<size_t>
 lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
     CalculateNumChildrenImpl(ValueObject &table) {
-  if (auto size_sp = table.GetChildMemberWithName("__size_"))
+  auto [size_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      table, /*anon_struct_idx=*/2, "__size_", "__p2_");
+  if (!is_compressed_pair && size_sp)
     return size_sp->GetValueAsUnsigned(0);
 
-  ValueObjectSP p2_sp = table.GetChildMemberWithName("__p2_");
-  if (!p2_sp)
-    return llvm::createStringError(
-        "Unexpected std::unordered_map layout: __p2_ member not found.");
+  if (!is_compressed_pair)
+    return llvm::createStringError("Unsupported std::unordered_map layout.");
 
-  if (!isOldCompressedPairLayout(*p2_sp))
-    return llvm::createStringError("Unexpected std::unordered_map layout: old "
-                                   "__compressed_pair layout not found.");
-
-  ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*p2_sp);
+  ValueObjectSP num_elements_sp = GetFirstValueOfLibCXXCompressedPair(*size_sp);
 
   if (!num_elements_sp)
     return llvm::createStringError(
@@ -246,19 +237,13 @@ lldb_private::formatters::LibcxxStdUnorderedMapSyntheticFrontEnd::
 }
 
 static ValueObjectSP GetTreePointer(ValueObject &table) {
-  ValueObjectSP tree_sp = table.GetChildMemberWithName("__first_node_");
-  if (!tree_sp) {
-    ValueObjectSP p1_sp = table.GetChildMemberWithName("__p1_");
-    if (!p1_sp)
-      return nullptr;
-
-    if (!isOldCompressedPairLayout(*p1_sp))
-      return nullptr;
-
-    tree_sp = GetFirstValueOfLibCXXCompressedPair(*p1_sp);
-    if (!tree_sp)
-      return nullptr;
-  }
+  auto [tree_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      table, /*anon_struct_idx=*/1, "__first_node_", "__p1_");
+  if (is_compressed_pair)
+    tree_sp = GetFirstValueOfLibCXXCompressedPair(*tree_sp);
+
+  if (!tree_sp)
+    return nullptr;
 
   return tree_sp->GetChildMemberWithName("__next_");
 }
diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
index 4bcdf01c221a..60913e5c1ac5 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxVector.cpp
@@ -126,17 +126,15 @@ lldb_private::formatters::LibcxxStdVectorSyntheticFrontEnd::GetChildAtIndex(
 }
 
 static ValueObjectSP GetDataPointer(ValueObject &root) {
-  if (auto cap_sp = root.GetChildMemberWithName("__cap_"))
-    return cap_sp;
-
-  ValueObjectSP cap_sp = root.GetChildMemberWithName("__end_cap_");
+  auto [cap_sp, is_compressed_pair] = GetValueOrOldCompressedPair(
+      root, /*anon_struct_idx=*/2, "__cap_", "__end_cap_");
   if (!cap_sp)
     return nullptr;
 
-  if (!isOldCompressedPairLayout(*cap_sp))
-    return nullptr;
+  if (is_compressed_pair)
+    return GetFirstValueOfLibCXXCompressedPair(*cap_sp);
 
-  return GetFirstValueOfLibCXXCompressedPair(*cap_sp);
+  return cap_sp;
 }
 
 lldb::ChildCacheState
diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
index 7adc00622ec2..d21dac221aa2 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
@@ -44,7 +44,8 @@ NativeRegisterContextFreeBSD::CreateHostNativeRegisterContextFreeBSD(
     NativeProcessFreeBSD &process = native_thread.GetProcess();
     g_register_flags_detector.DetectFields(
         process.GetAuxValue(AuxVector::AUXV_FREEBSD_AT_HWCAP).value_or(0),
-        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP2).value_or(0));
+        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP2).value_or(0),
+        /*hwcap3=*/0);
   }
 
   return new NativeRegisterContextFreeBSD_arm64(target_arch, native_thread);
diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
index 884c7d4b9e35..b1c7421bef8d 100644
--- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp
@@ -162,10 +162,13 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux(
 
     opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskTLS);
 
+    std::optional<uint64_t> auxv_at_hwcap3 =
+        process.GetAuxValue(AuxVector::AUXV_AT_HWCAP3);
     std::lock_guard<std::mutex> lock(g_register_flags_detector_mutex);
     if (!g_register_flags_detector.HasDetected())
       g_register_flags_detector.DetectFields(auxv_at_hwcap.value_or(0),
-                                             auxv_at_hwcap2.value_or(0));
+                                             auxv_at_hwcap2.value_or(0),
+                                             auxv_at_hwcap3.value_or(0));
 
     auto register_info_up =
         std::make_unique<RegisterInfoPOSIX_arm64>(target_arch, opt_regsets);
diff --git a/lldb/source/Plugins/Process/Utility/AuxVector.cpp b/lldb/source/Plugins/Process/Utility/AuxVector.cpp
index f495ffb1924e..50500a8593e1 100644
--- a/lldb/source/Plugins/Process/Utility/AuxVector.cpp
+++ b/lldb/source/Plugins/Process/Utility/AuxVector.cpp
@@ -84,6 +84,7 @@ const char *AuxVector::GetEntryName(EntryType type) const {
     case ENTRY_NAME(AUXV_AT_BASE_PLATFORM);  break;
     case ENTRY_NAME(AUXV_AT_RANDOM);         break;
     case ENTRY_NAME(AUXV_AT_HWCAP2);         break;
+    case ENTRY_NAME(AUXV_AT_HWCAP3);         break;
     case ENTRY_NAME(AUXV_AT_EXECFN);         break;
     case ENTRY_NAME(AUXV_AT_SYSINFO);        break;
     case ENTRY_NAME(AUXV_AT_SYSINFO_EHDR);   break;
diff --git a/lldb/source/Plugins/Process/Utility/AuxVector.h b/lldb/source/Plugins/Process/Utility/AuxVector.h
index 2670b34f6b0a..7733e0ffc683 100644
--- a/lldb/source/Plugins/Process/Utility/AuxVector.h
+++ b/lldb/source/Plugins/Process/Utility/AuxVector.h
@@ -57,6 +57,7 @@ class AuxVector {
     AUXV_AT_BASE_PLATFORM = 24, ///< String identifying real platforms.
     AUXV_AT_RANDOM = 25,        ///< Address of 16 random bytes.
     AUXV_AT_HWCAP2 = 26,        ///< Extension of AT_HWCAP.
+    AUXV_AT_HWCAP3 = 29,        ///< Extension of AT_HWCAP.
     AUXV_AT_EXECFN = 31,        ///< Filename of executable.
     AUXV_AT_SYSINFO = 32, ///< Pointer to the global system page used for system
                           /// calls and other nice things.
diff --git a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
index 042940b7dff6..330a24af67c4 100644
--- a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
+++ b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.cpp
@@ -26,11 +26,15 @@
 #define HWCAP2_EBF16 (1ULL << 32)
 #define HWCAP2_FPMR (1ULL << 48)
 
+#define HWCAP3_MTE_STORE_ONLY (1ULL << 1)
+
 using namespace lldb_private;
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   (void)hwcap;
+  (void)hwcap3;
 
   if (!(hwcap2 & HWCAP2_FPMR))
     return {};
@@ -53,8 +57,10 @@ Arm64RegisterFlagsDetector::DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2) {
 
 Arm64RegisterFlagsDetector::Fields
 Arm64RegisterFlagsDetector::DetectGCSFeatureFields(uint64_t hwcap,
-                                                   uint64_t hwcap2) {
+                                                   uint64_t hwcap2,
+                                                   uint64_t hwcap3) {
   (void)hwcap2;
+  (void)hwcap3;
 
   if (!(hwcap & HWCAP_GCS))
     return {};
@@ -67,8 +73,10 @@ Arm64RegisterFlagsDetector::DetectGCSFeatureFields(uint64_t hwcap,
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   (void)hwcap;
+  (void)hwcap3;
 
   if (!(hwcap2 & HWCAP2_SME))
     return {};
@@ -83,8 +91,8 @@ Arm64RegisterFlagsDetector::DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap,
-                                                uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2,
+                                                uint64_t hwcap3) {
   (void)hwcap;
 
   if (!(hwcap2 & HWCAP2_MTE))
@@ -94,16 +102,29 @@ Arm64RegisterFlagsDetector::DetectMTECtrlFields(uint64_t hwcap,
   // to prctl(PR_TAGGED_ADDR_CTRL...). Fields are derived from the defines
   // used to build the value.
 
+  std::vector<RegisterFlags::Field> fields;
+  fields.reserve(4);
+  if (hwcap3 & HWCAP3_MTE_STORE_ONLY)
+    fields.push_back({"STORE_ONLY", 19});
+
   static const FieldEnum tcf_enum(
       "tcf_enum",
       {{0, "TCF_NONE"}, {1, "TCF_SYNC"}, {2, "TCF_ASYNC"}, {3, "TCF_ASYMM"}});
-  return {{"TAGS", 3, 18}, // 16 bit bitfield shifted up by PR_MTE_TAG_SHIFT.
-          {"TCF", 1, 2, &tcf_enum},
-          {"TAGGED_ADDR_ENABLE", 0}};
+
+  fields.insert(
+      std::end(fields),
+      {{"TAGS", 3, 18}, // 16 bit bitfield shifted up by PR_MTE_TAG_SHIFT.
+       {"TCF", 1, 2, &tcf_enum},
+       {"TAGGED_ADDR_ENABLE", 0}});
+
+  return fields;
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
+  (void)hwcap3;
+
   static const FieldEnum rmode_enum(
       "rmode_enum", {{0, "RN"}, {1, "RP"}, {2, "RM"}, {3, "RZ"}});
 
@@ -142,10 +163,12 @@ Arm64RegisterFlagsDetector::DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
   // fpsr's contents are constant.
   (void)hwcap;
   (void)hwcap2;
+  (void)hwcap3;
 
   return {
       // Bits 31-28 are N/Z/C/V, only used by AArch32.
@@ -162,7 +185,10 @@ Arm64RegisterFlagsDetector::DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2) {
 }
 
 Arm64RegisterFlagsDetector::Fields
-Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2) {
+Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                             uint64_t hwcap3) {
+  (void)hwcap3;
+
   // The fields here are a combination of the Arm manual's SPSR_EL1,
   // plus a few changes where Linux has decided not to make use of them at all,
   // or at least not from userspace.
@@ -207,9 +233,10 @@ Arm64RegisterFlagsDetector::DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2) {
   return cpsr_fields;
 }
 
-void Arm64RegisterFlagsDetector::DetectFields(uint64_t hwcap, uint64_t hwcap2) {
+void Arm64RegisterFlagsDetector::DetectFields(uint64_t hwcap, uint64_t hwcap2,
+                                              uint64_t hwcap3) {
   for (auto &reg : m_registers)
-    reg.m_flags.SetFields(reg.m_detector(hwcap, hwcap2));
+    reg.m_flags.SetFields(reg.m_detector(hwcap, hwcap2, hwcap3));
   m_has_detected = true;
 }
 
diff --git a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
index 7daebcc71db0..aec2bf9f4886 100644
--- a/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
+++ b/lldb/source/Plugins/Process/Utility/RegisterFlagsDetector_arm64.h
@@ -40,7 +40,7 @@ class Arm64RegisterFlagsDetector {
   /// If called more than once, fields will be redetected each time from
   /// scratch. If the target would not have this register at all, the list of
   /// fields will be left empty.
-  void DetectFields(uint64_t hwcap, uint64_t hwcap2);
+  void DetectFields(uint64_t hwcap, uint64_t hwcap2, uint64_t hwcap3);
 
   /// Add the field information of any registers named in this class,
   /// to the relevant RegisterInfo instances. Note that this will be done
@@ -53,15 +53,22 @@ class Arm64RegisterFlagsDetector {
 
 private:
   using Fields = std::vector<RegisterFlags::Field>;
-  using DetectorFn = std::function<Fields(uint64_t, uint64_t)>;
+  using DetectorFn = std::function<Fields(uint64_t, uint64_t, uint64_t)>;
 
-  static Fields DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2);
-  static Fields DetectGCSFeatureFields(uint64_t hwcap, uint64_t hwcap2);
+  static Fields DetectCPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPSRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectMTECtrlFields(uint64_t hwcap, uint64_t hwcap2,
+                                    uint64_t hwcap3);
+  static Fields DetectSVCRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectFPMRFields(uint64_t hwcap, uint64_t hwcap2,
+                                 uint64_t hwcap3);
+  static Fields DetectGCSFeatureFields(uint64_t hwcap, uint64_t hwcap2,
+                                       uint64_t hwcap3);
 
   struct RegisterEntry {
     RegisterEntry(llvm::StringRef name, unsigned size, DetectorFn detector)
diff --git a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
index bd02bb0e69a4..d5046d369ab2 100644
--- a/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
+++ b/lldb/source/Plugins/Process/elf-core/RegisterContextPOSIXCore_arm64.cpp
@@ -96,14 +96,19 @@ RegisterContextCorePOSIX_arm64::RegisterContextCorePOSIX_arm64(
   llvm::Triple::OSType os = process->GetArchitecture().GetTriple().getOS();
   if ((os == llvm::Triple::Linux) || (os == llvm::Triple::FreeBSD)) {
     AuxVector aux_vec(process->GetAuxvData());
-    std::optional<uint64_t> auxv_at_hwcap = aux_vec.GetAuxValue(
-        os == llvm::Triple::FreeBSD ? AuxVector::AUXV_FREEBSD_AT_HWCAP
-                                    : AuxVector::AUXV_AT_HWCAP);
+    bool is_freebsd = os == llvm::Triple::FreeBSD;
+    std::optional<uint64_t> auxv_at_hwcap =
+        aux_vec.GetAuxValue(is_freebsd ? AuxVector::AUXV_FREEBSD_AT_HWCAP
+                                       : AuxVector::AUXV_AT_HWCAP);
     std::optional<uint64_t> auxv_at_hwcap2 =
         aux_vec.GetAuxValue(AuxVector::AUXV_AT_HWCAP2);
+    std::optional<uint64_t> auxv_at_hwcap3 =
+        is_freebsd ? std::nullopt
+                   : aux_vec.GetAuxValue(AuxVector::AUXV_AT_HWCAP3);
 
     m_register_flags_detector.DetectFields(auxv_at_hwcap.value_or(0),
-                                           auxv_at_hwcap2.value_or(0));
+                                           auxv_at_hwcap2.value_or(0),
+                                           auxv_at_hwcap3.value_or(0));
     m_register_flags_detector.UpdateRegisterInfo(GetRegisterInfo(),
                                                  GetRegisterCount());
   }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index c76d67b47b33..8916c58beec0 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -2201,6 +2201,18 @@ bool DWARFASTParserClang::CompleteRecordType(const DWARFDIE &die,
       for (DelayedAddObjCClassProperty &property : delayed_properties)
         property.Finalize();
     }
+  } else if (Language::LanguageIsObjC(
+                 static_cast<LanguageType>(die.GetAttributeValueAsUnsigned(
+                     DW_AT_APPLE_runtime_class, eLanguageTypeUnknown)))) {
+    /// The forward declaration was C++ but the definition is Objective-C.
+    /// We currently don't handle such situations. In such cases, keep the
+    /// forward declaration without a definition to avoid violating Clang AST
+    /// invariants.
+    LLDB_LOG(GetLog(LLDBLog::Expressions),
+             "WARNING: Type completion aborted because forward declaration for "
+             "'{0}' is C++ while definition is Objective-C.",
+             llvm::StringRef(die.GetName()));
+    return {};
   }
 
   if (!bases.empty()) {
diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td
index 4aa9e046d607..656503bb8d22 100644
--- a/lldb/source/Target/TargetProperties.td
+++ b/lldb/source/Target/TargetProperties.td
@@ -5,8 +5,8 @@ let Definition = "target_experimental" in {
     Global, DefaultTrue,
     Desc<"If true, inject local variables explicitly into the expression text. This will fix symbol resolution when there are name collisions between ivars and local variables. But it can make expressions run much more slowly.">;
   def UseDIL : Property<"use-DIL", "Boolean">,
-    Global, DefaultFalse,
-    Desc<"If true, use the alternative DIL implementation for frame variable evaluation.">;
+    Global, DefaultTrue,
+    Desc<"If true, use the DIL implementation for frame variable evaluation.">;
 }
 
 let Definition = "target" in {
diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp
index fd3f9f872460..6f28434c646c 100644
--- a/lldb/source/ValueObject/DILEval.cpp
+++ b/lldb/source/ValueObject/DILEval.cpp
@@ -303,7 +303,7 @@ Interpreter::Visit(const MemberOfNode *node) {
     }
   }
 
-  if (field_obj && field_obj->GetName() == node->GetFieldName()) {
+  if (field_obj) {
     if (m_use_dynamic != lldb::eNoDynamicValues) {
       lldb::ValueObjectSP dynamic_val_sp =
           field_obj->GetDynamicValue(m_use_dynamic);
diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
index b2ce9602e6a5..8c009aa182d0 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
+++ b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/TestFrameVarDILQualifiedId.py
@@ -29,3 +29,17 @@ def test_frame_var(self):
         self.expect_var_path("ns::i", value="1")
         self.expect_var_path("::ns::ns::i", value="2")
         self.expect_var_path("ns::ns::i", value="2")
+
+        self.expect_var_path("foo", value="1")
+        self.expect_var_path("::(anonymous namespace)::foo", value="13")
+        self.expect_var_path("(anonymous namespace)::foo", value="13")
+        self.expect_var_path("ns1::(anonymous namespace)::foo", value="5")
+        self.expect_var_path(
+            "(anonymous namespace)::ns2::(anonymous namespace)::foo",
+            value="7",
+        )
+        self.expect_var_path("::ns1::(anonymous namespace)::foo", value="5")
+        self.expect_var_path(
+            "::(anonymous namespace)::ns2::(anonymous namespace)::foo",
+            value="7",
+        )
diff --git a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
index 8a5c47a6f364..10ffa1e54a99 100644
--- a/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
+++ b/lldb/test/API/commands/frame/var-dil/basics/QualifiedId/main.cpp
@@ -10,7 +10,26 @@ int i = 2;
 
 } // namespace ns
 
+namespace {
+int foo = 13;
+}
+
+namespace ns1 {
+namespace {
+int foo = 5;
+}
+} // namespace ns1
+
+namespace {
+namespace ns2 {
+namespace {
+int foo = 7;
+}
+} // namespace ns2
+} // namespace
+
 int main(int argc, char **argv) {
+  int foo = 1;
 
-  return 0; // Set a breakpoint here
+  return foo + ::foo + ns1::foo + ns2::foo; // Set a breakpoint here
 }
diff --git a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
index 2570f267bf46..c003d87f8ca3 100644
--- a/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
+++ b/lldb/test/API/commands/register/register/aarch64_mte_ctrl_register/TestMTECtrlRegister.py
@@ -34,29 +34,41 @@ def test_mte_ctrl_register(self):
             substrs=["stop reason = breakpoint 1."],
         )
 
-        def check_mte_ctrl(async_err, sync_err):
+        has_store_only = self.isAArch64MTEStoreOnly()
+
+        def check_mte_ctrl(async_err, sync_err, store_only):
             # Bit 0 = tagged addressing enabled
             # Bit 1 = synchronous faults
             # Bit 2 = asynchronous faults
-            value = "0x{:016x}".format((async_err << 2) | (sync_err << 1) | 1)
+            # Bit 19 = store only checking mode
+            value = "0x{:016x}".format(
+                (store_only << 19) | (async_err << 2) | (sync_err << 1) | 1
+            )
             expected = [value]
 
             if self.hasXMLSupport():
+                fields = "("
+                if has_store_only:
+                    fields += f"STORE_ONLY = {store_only}, "
+
                 tfc_modes = ["NONE", "SYNC", "ASYNC", "ASYMM"]
-                expected.append(
-                    f"(TAGS = 0, TCF = TCF_{tfc_modes[async_err << 1 | sync_err]}, TAGGED_ADDR_ENABLE = 1)".format(
-                        async_err, sync_err
-                    )
-                )
+                fields += f"TAGS = 0, TCF = TCF_{tfc_modes[async_err << 1 | sync_err]}, TAGGED_ADDR_ENABLE = 1)"
+
+                expected.append(fields)
 
             self.expect("register read mte_ctrl", substrs=expected)
 
         # We start enabled with synchronous faults.
-        check_mte_ctrl(0, 1)
+        check_mte_ctrl(0, 1, 0)
         # Change to asynchronous faults.
         self.runCmd("register write mte_ctrl 5")
-        check_mte_ctrl(1, 0)
+        check_mte_ctrl(1, 0, 0)
         # This would return to synchronous faults if we did not restore the
         # previous value.
         self.expect("expression setup_mte()", substrs=["= 0"])
-        check_mte_ctrl(1, 0)
+        check_mte_ctrl(1, 0, 0)
+
+        # Store only checking requires FEAT_MTE_STORE_ONLY.
+        if has_store_only:
+            self.runCmd(f"register write mte_ctrl {1 | (1 << 19)}")
+            check_mte_ctrl(0, 0, 1)
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
index 49c0c814f077..8ce46ba3b886 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/TestDataFormatterStdString.py
@@ -80,6 +80,8 @@ def cleanup():
                 '(%s::string) Q = "quite a long std::strin with lots of info inside it"'
                 % ns,
                 "(%s::string *) null_str = nullptr" % ns,
+                '(CustomString) custom_str = "hello!"',
+                '(CustomWString) custom_wstr = L"hello!"',
             ],
         )
 
@@ -143,6 +145,10 @@ def do_test_multibyte(self):
                 '(%s::u16string) u16_empty = u""' % ns,
                 '(%s::u32string) u32_string = U"🍄🍅🍆🍌"' % ns,
                 '(%s::u32string) u32_empty = U""' % ns,
+                '(CustomStringU16) custom_u16 = u"ß水氶"',
+                '(CustomStringU16) custom_u16_empty = u""',
+                '(CustomStringU32) custom_u32 = U"🍄🍅🍆🍌"',
+                '(CustomStringU32) custom_u32_empty = U""',
             ],
         )
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/main.cpp
index f22c890861d0..577b78e35fc1 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/string/main.cpp
@@ -1,6 +1,33 @@
+#include <cstdlib>
 #include <stdint.h>
 #include <string>
 
+template <typename T> struct CustomAlloc {
+  using value_type = T;
+  using pointer = value_type *;
+  using const_pointer = const value_type *;
+  using size_type = std::size_t;
+
+  pointer allocate(size_type n) { return (T *)malloc(n * sizeof(T)); }
+
+  void deallocate(pointer p, size_type) {
+    if (p)
+      free(p);
+  }
+};
+
+using CustomString =
+    std::basic_string<char, std::char_traits<char>, CustomAlloc<char>>;
+
+using CustomWString =
+    std::basic_string<wchar_t, std::char_traits<wchar_t>, CustomAlloc<wchar_t>>;
+
+using CustomStringU16 = std::basic_string<char16_t, std::char_traits<char16_t>,
+                                          CustomAlloc<char16_t>>;
+
+using CustomStringU32 = std::basic_string<char32_t, std::char_traits<char32_t>,
+                                          CustomAlloc<char32_t>>;
+
 size_t touch_string(std::string &in_str) {
   return in_str.size(); // Break here to look at bad string
 }
@@ -99,8 +126,16 @@ int main() {
   std::string *pq = &q;
   std::string *pQ = &Q;
 
+  CustomString custom_str("hello!");
+  CustomWString custom_wstr(L"hello!");
+  CustomStringU16 custom_u16(u16_string.c_str());
+  CustomStringU16 custom_u16_empty(u"");
+  CustomStringU32 custom_u32(u32_string.c_str());
+  CustomStringU32 custom_u32_empty(U"");
+
   S.assign(L"!!!!!"); // Set break point at this line.
   std::string *not_a_string = (std::string *)0x0;
   touch_string(*not_a_string);
+
   return 0;
 }
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
index f10811817c0d..5943b35deab8 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/invalid-vector/main.cpp
@@ -1,4 +1,4 @@
-#define COMPRESSED_PAIR_REV 3
+#define COMPRESSED_PAIR_REV 4
 #include <libcxx-simulators-common/compressed_pair.h>
 
 namespace std {
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
index c8d9c2e389a0..f27fc2e3c456 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/TestDataFormatterLibcxxStringSimulator.py
@@ -28,7 +28,7 @@ def _run_test(self, defines):
 
 for v in [None, "ALTERNATE_LAYOUT"]:
     for r in range(6):
-        for c in range(4):
+        for c in range(5):
             name = "test_r%d_c%d" % (r, c)
             defines = ["REVISION=%d" % r, "COMPRESSED_PAIR_REV=%d" % c]
             if v:
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
index cf431e524069..b19c05159670 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/string/main.cpp
@@ -209,7 +209,7 @@ template <class _CharT, class _Traits, class _Allocator> class basic_string {
   __long &getLongRep() {
 #if COMPRESSED_PAIR_REV == 0
     return __r_.first().__l;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
     return __rep_.__l;
 #endif
   }
@@ -217,14 +217,14 @@ template <class _CharT, class _Traits, class _Allocator> class basic_string {
   __short &getShortRep() {
 #if COMPRESSED_PAIR_REV == 0
     return __r_.first().__s;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
     return __rep_.__s;
 #endif
   }
 
 #if COMPRESSED_PAIR_REV == 0
   std::__lldb::__compressed_pair<__rep, allocator_type> __r_;
-#elif COMPRESSED_PAIR_REV <= 3
+#else
   _LLDB_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_);
 #endif
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
index e623c3a1413b..1e25ac947203 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/TestDataFormatterLibcxxUniquePtrSimulator.py
@@ -26,7 +26,7 @@ def _run_test(self, defines):
         )
 
 
-for r in range(4):
+for r in range(5):
     name = "test_r%d" % r
     defines = ["COMPRESSED_PAIR_REV=%d" % r]
 
diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
index 3d174a91cc26..bd840aaceffa 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx-simulators/unique_ptr/main.cpp
@@ -20,8 +20,7 @@ template <class _Tp, class _Dp = default_delete<_Tp>> class unique_ptr {
   std::__lldb::__compressed_pair<pointer, deleter_type> __ptr_;
   explicit unique_ptr(pointer __p) noexcept
       : __ptr_(__p, std::__lldb::__value_init_tag()) {}
-#elif COMPRESSED_PAIR_REV == 1 || COMPRESSED_PAIR_REV == 2 ||                  \
-    COMPRESSED_PAIR_REV == 3
+#else
   _LLDB_COMPRESSED_PAIR(pointer, __ptr_, deleter_type, __deleter_);
   explicit unique_ptr(pointer __p) noexcept : __ptr_(__p), __deleter_() {}
 #endif
diff --git a/lldb/test/API/lang/cpp/lambdas/TestLambdas.py b/lldb/test/API/lang/cpp/lambdas/TestLambdas.py
index c8308c16011e..112f375c0ef2 100644
--- a/lldb/test/API/lang/cpp/lambdas/TestLambdas.py
+++ b/lldb/test/API/lang/cpp/lambdas/TestLambdas.py
@@ -1,4 +1,12 @@
 from lldbsuite.test import lldbinline
 from lldbsuite.test import decorators
 
-lldbinline.MakeInlineTest(__file__, globals())
+lldbinline.MakeInlineTest(
+    __file__,
+    globals(),
+    [
+        decorators.expectedFailureAll(
+            bugnumber="https://github.com/llvm/llvm-project/issues/149477"
+        )
+    ],
+)
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
index bfdc8229094f..825e1a4b79fd 100644
--- a/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
+++ b/lldb/test/API/linux/aarch64/mte_core_file/TestAArch64LinuxMTEMemoryTagCoreFile.py
@@ -10,8 +10,8 @@
 class AArch64LinuxMTEMemoryTagCoreFileTestCase(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
-    MTE_BUF_ADDR = hex(0xFFFF82C74000)
-    BUF_ADDR = hex(0xFFFF82C73000)
+    MTE_BUF_ADDR = hex(0xFFFFA733B000)
+    BUF_ADDR = hex(0xFFFFA733A000)
 
     @skipIfLLVMTargetMissing("AArch64")
     def test_mte_tag_core_file_memory_region(self):
@@ -215,7 +215,7 @@ def test_mte_tag_fault_reason(self):
         self.expect(
             "bt",
             substrs=[
-                "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address=0xffff82c74010)"
+                "* thread #1, name = 'a.out.mte', stop reason = SIGSEGV: sync tag check fault (fault address=0xffffa733b010)"
             ],
         )
 
@@ -231,12 +231,15 @@ def test_mte_ctrl_register(self):
         self.runCmd("target create --core core.mte")
         # The expected value is:
         # * Allowed tags value of 0xFFFF, shifted up by 3 resulting in 0x7fff8.
+        # * Bit 19 set to 0, which means that store only checking is disabled.
         # * Bit 1 set to enable synchronous tag faults.
         # * Bit 0 set to enable the tagged address ABI.
         expected = ["mte_ctrl = 0x000000000007fffb"]
 
         if self.hasXMLSupport():
-            expected.append("(TAGS = 65535, TCF = TCF_SYNC, TAGGED_ADDR_ENABLE = 1)")
+            expected.append(
+                "(STORE_ONLY = 0, TAGS = 65535, TCF = TCF_SYNC, TAGGED_ADDR_ENABLE = 1)"
+            )
 
         self.expect("register read mte_ctrl", substrs=expected)
 
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/core.mte b/lldb/test/API/linux/aarch64/mte_core_file/core.mte
index 84a3266667e7..188d06d11c71 100644
Binary files a/lldb/test/API/linux/aarch64/mte_core_file/core.mte and b/lldb/test/API/linux/aarch64/mte_core_file/core.mte differ
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/core.nomte b/lldb/test/API/linux/aarch64/mte_core_file/core.nomte
index 201f2880e6dc..454ff8361cc3 100644
Binary files a/lldb/test/API/linux/aarch64/mte_core_file/core.nomte and b/lldb/test/API/linux/aarch64/mte_core_file/core.nomte differ
diff --git a/lldb/test/API/linux/aarch64/mte_core_file/main.c b/lldb/test/API/linux/aarch64/mte_core_file/main.c
index 6537edd7bdb9..597459459bb0 100644
--- a/lldb/test/API/linux/aarch64/mte_core_file/main.c
+++ b/lldb/test/API/linux/aarch64/mte_core_file/main.c
@@ -23,7 +23,7 @@
 
 int main(int argc, char const *argv[]) {
 #ifdef NO_MTE
-  *(char *)(0) = 0;
+  __builtin_trap();
 #endif
 
   if (prctl(PR_SET_TAGGED_ADDR_CTRL,
diff --git a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
index 0d2774b28171..20a75f4076e4 100644
--- a/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
+++ b/lldb/test/API/tools/lldb-dap/evaluate/TestDAP_evaluate.py
@@ -131,7 +131,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("var2 + struct1.foo")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 2, which is an anonymous block
         self.continue_to_breakpoint(breakpoint_2)
@@ -169,7 +169,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("var2 + struct1.foo")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Expressions at breakpoint 3, which is inside a_function
         self.continue_to_breakpoint(breakpoint_3)
@@ -195,7 +195,7 @@ def run_test_evaluate_expressions(
             self.assertEvaluateFailure("a_function(1)")
             self.assertEvaluateFailure("list + 1")
             self.assertEvaluateFailure("foo_func")
-            self.assertEvaluateFailure("foo_var")
+            self.assertEvaluate("foo_var", "44")
 
         # Now we check that values are updated after stepping
         self.continue_to_breakpoint(breakpoint_4)
diff --git a/lldb/test/Shell/Expr/TestLambdaExprImport.test b/lldb/test/Shell/Expr/TestLambdaExprImport.test
new file mode 100644
index 000000000000..c57ce06453fe
--- /dev/null
+++ b/lldb/test/Shell/Expr/TestLambdaExprImport.test
@@ -0,0 +1,26 @@
+# Test that we can successfully ASTImport clang::LambdaExpr nodes.
+# Currently this is not supported in MinimalImport mode (which LLDB
+# uses always).
+
+# RUN: split-file %s %t
+# RUN: %clang_host -g -gdwarf %t/main.cpp -o %t.out
+# RUN: %lldb -o "settings set interpreter.stop-command-source-on-error false" \
+# RUN:       -x -b -s %t/commands.input %t.out 2>&1 \
+# RUN:       | FileCheck %s
+
+#--- main.cpp
+
+int main() {
+  __builtin_debugtrap();
+}
+
+#--- commands.input
+
+run
+expression --top-level -- void method(int x) { [x=x] { ; }; }
+target dump typesystem
+
+# CHECK:      expression
+# CHECK:      target dump typesystem
+# CHECK-NOT:  FunctionDecl
+# CHECK-NOT:  LambdaExpr
diff --git a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
index d4fcf78d01b8..c29b51219d19 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
+++ b/lldb/test/Shell/SymbolFile/DWARF/TestDedupWarnings.test
@@ -15,7 +15,7 @@
 # RUN: %clang_host -fmodules -Xclang -fmodules-cache-path=%t/cache -I%t -g -gmodules %t/b.m -o %t/b.o -c
 # RUN: %clang_host %t/a.o %t/b.o -o %t/a.out
 # RUN: rm -rf %t/cache
-# RUN: %lldb %t/a.out -o "b main" -o run -o "p a" -o "p b" -o q 2>&1 | FileCheck %s
+# RUN: %lldb %t/a.out -o "b main" -o run -o "expr a" -o "expr b" -o q 2>&1 | FileCheck %s
 # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist
 # CHECK-NOT: /cache/{{.*}}/C-{.*}.pcm' does not exist
 # CHECK: {{[ab]}}.o{{.*}}/cache/{{.*}}/C-{{.*}}.pcm' does not exist
diff --git a/lldb/test/Shell/SymbolFile/DWARF/objcxx-forward-decls.test b/lldb/test/Shell/SymbolFile/DWARF/objcxx-forward-decls.test
new file mode 100644
index 000000000000..30109c2943c9
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/objcxx-forward-decls.test
@@ -0,0 +1,70 @@
+# REQUIRES: system-darwin
+
+# In this test we have two CUs with conflicting forward declaration
+# depending on the CU language (one is C++ and the other is Objective-C++).
+# We are then stopped in the C++ CU and try to print the type, at which
+# point LLDB will try to make it into an Clang AST node. If LLDB were to
+# interpret the type as C++ instead of Objective-C, we'd violate Clang
+# invariants and crash.
+#
+# RUN: split-file %s %t
+# RUN: %clangxx_host -c -g -x objective-c++ %t/request.m -o %t/request_objc.o
+# RUN: %clangxx_host -c -g %t/main.cpp -o %t/main.o
+# RUN: %clangxx_host %t/main.o %t/request_objc.o -framework Foundation -o %t/a.out
+#
+# RUN: %lldb %t/a.out \
+# RUN:    -o "breakpoint set -p return -X main" \
+# RUN:    -o run \
+# RUN:    -o "frame variable r" \
+# RUN:    -o exit | FileCheck %s
+#
+# RUN: dsymutil %t/a.out
+#
+# RUN: %lldb %t/a.out \
+# RUN:    -o "breakpoint set -p return -X main" \
+# RUN:    -o run \
+# RUN:    -o "frame variable r" \
+# RUN:    -o exit | FileCheck %s --check-prefix=CHECK-DSYM
+
+# CHECK:      (lldb) frame variable r
+# CHECK-NEXT: (Request) ::r = (m_request = "Hello, World!")
+
+# CHECK-DSYM:      (lldb) frame variable r
+# CHECK-DSYM-NEXT: (Request) ::r = (m_request = "Hello, World!")
+
+#--- lib.h
+#ifndef LIB_H_IN
+#define LIB_H_IN
+
+#ifdef __OBJC__
+@class NSString;                                               
+#else
+class NSString;
+#endif
+
+struct Request {
+  NSString * m_request = nullptr;
+};
+
+#endif // _H_IN
+
+#--- main.cpp
+#include "lib.h"
+
+void process(Request *);
+
+Request r;
+
+int main() {
+    process(&r);
+    return 0;
+}
+
+#--- request.m
+#import <Foundation/Foundation.h>
+
+#include "lib.h"
+
+void process(Request * r) {
+  r->m_request = @"Hello, World!";
+}
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 055a787371ba..72fa734d5dd1 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -180,28 +180,29 @@ if ("flang" IN_LIST LLVM_ENABLE_PROJECTS)
 endif()
 
 if ("libc" IN_LIST LLVM_ENABLE_PROJECTS)
-  message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated.  Please use "
+  message(WARNING "Using LLVM_ENABLE_PROJECTS=libc is deprecated now, and will "
+    "become a fatal error in a future release. Please use "
     "-DLLVM_ENABLE_RUNTIMES=libc or see the instructions at "
     "https://libc.llvm.org/ for building the runtimes.")
 endif()
 
 if ("compiler-rt" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=compiler-rt is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=compiler-rt or see the instructions at "
     "https://compiler-rt.llvm.org/ for building the runtimes.")
 endif()
 
 if ("offload" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=offload is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=offload or see the instructions at "
     "https://openmp.llvm.org/ for building the runtimes.")
 endif()
 
 if ("openmp" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=openmp is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=openmp or see the instructions at "
     "https://openmp.llvm.org/ for building the runtimes.")
 endif()
@@ -214,7 +215,7 @@ endif ()
 
 if ("libclc" IN_LIST LLVM_ENABLE_PROJECTS)
   message(WARNING "Using LLVM_ENABLE_PROJECTS=libclc is deprecated now, and will "
-    "become a fatal error in the LLVM 21 release.  Please use "
+    "become a fatal error in a future release.  Please use "
     "-DLLVM_ENABLE_RUNTIMES=libclc or see the instructions at "
     "https://libclc.llvm.org/ for building the runtimes.")
 endif()
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 6191a915051f..8e97f25ba5e4 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -154,10 +154,23 @@ Changes to the MIPS Backend
 ---------------------------
 
 * `-mcpu=i6400` and `-mcpu=i6500` were added.
+* Added support for `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
 
 Changes to the PowerPC Backend
 ------------------------------
 
+* Add spill and restore for DMR and DMRp registers.
+* Prototype various Dense Math Facility instructions, and intrinsics for basic enablement, insert/extract, integer and FP calculations.
+* Add prototype for Dense Math Facility cryptography instructions.
+* Implement load/stores prototype for v1024i1, v2048i1.
+* Support conversion between f16 and f128.
+* Change default for auto gen stxvp for cpu=future.
+* Setup initial JITLink build support for XCOFF.
+* Add an API to derive the default feature set from a CPU name within the TargetParser
+  (e.g. `pwr10` -> `+vsx`,`+isa3_1`,`+mma`). Clang now uses this to populate the `target-feature`
+  list when `-mcpu` is provided for PowerPC.
+* Various bug fixes and codegen improvements.
+
 Changes to the RISC-V Backend
 -----------------------------
 
@@ -219,6 +232,7 @@ Changes to the RISC-V Backend
 * Removed -mattr=+no-rvc-hints that could be used to disable parsing and generation of RVC hints.
 * Adds assembler support for the Andes `XAndesvsintload` (Andes Vector INT4 Load extension).
 * Adds assembler support for the Andes `XAndesbfhcvt` (Andes Scalar BFLOAT16 Conversion extension).
+* Add combine for shadd family of instructions.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -228,11 +242,14 @@ Changes to the Windows Target
 
 * `fp128` is now passed indirectly, meaning it uses the same calling convention
   as `i128`.
+* Added support for `mipsel-windows-gnu` and `mipsel-windows-msvc` targets.
 
 Changes to the X86 Backend
 --------------------------
 
 * `fp128` will now use `*f128` libcalls on 32-bit GNU targets as well.
+* On x86-32, `fp128` and `i128` are now passed with the expected 16-byte stack
+  alignment.
 
 Changes to the OCaml bindings
 -----------------------------
@@ -311,10 +328,21 @@ Changes to LLDB
     stop reason = SIGSEGV: sent by tkill system call (sender pid=649752, uid=2667987)
   ```
 * ELF Cores can now have their siginfo structures inspected using `thread siginfo`.
+* LLDB now uses
+  [DIL](https://discourse.llvm.org/t/rfc-data-inspection-language/69893) as the
+  default implementation for 'frame variable'. This should not change the
+  behavior of 'frame variable' at all, at this time. To revert to using the
+  old implementation use: `settings set target.experimental.use-DIL false`.
 * Disassembly of unknown instructions now produces `<unknown>` instead of
   nothing at all
 * Changed the format of opcode bytes to match llvm-objdump when disassembling
   RISC-V code with `disassemble`'s `--byte` option.
+* LLDB added native support for the Model Context Protocol  (MCP). An MCP
+  server can be started with the `protocol-server start MCP` command.
+* On AArch64 Linux, LLDB will now show the state of the `STORE_ONLY` field of
+  `mte_ctrl`. This will only be shown on hardware that has the
+  `FEAT_MTE_STORE_ONLY` architecture feature.
+
 
 ### Changes to lldb-dap
 
@@ -329,6 +357,18 @@ Changes to Sanitizers
 
 Other Changes
 -------------
+* A new ThinLTO backend has been added to implement the
+  [Integrated Distributed ThinLTO](https://llvm.org/docs/DTLTO.html) (DTLTO)
+  feature. This new backend delegates the ThinLTO backend compilation jobs to an
+  external process (the distributor), which in turn coordinates distribution
+  through a system such as Incredibuild. A JSON interface is used for
+  communication with the distributor.
+  ([#47468](https://github.com/llvm/llvm-project/issues/47468)).
+
+Changes to the Profile Runtime
+---------------------
+
+* On AIX, avoid using mmap when reading profile files from a non-local filesystem.
 
 External Open Source Projects Using LLVM {{env.config.release}}
 ===============================================================
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index b985292ccee4..1dc73205a0eb 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -47,6 +47,8 @@ enum class RecurKind {
   FMul,     ///< Product of floats.
   FMin,     ///< FP min implemented in terms of select(cmp()).
   FMax,     ///< FP max implemented in terms of select(cmp()).
+  FMinNum,  ///< FP min with llvm.minnum semantics including NaNs.
+  FMaxNum,  ///< FP max with llvm.maxnum semantics including NaNs.
   FMinimum, ///< FP min with llvm.minimum semantics
   FMaximum, ///< FP max with llvm.maximum semantics
   FMinimumNum, ///< FP min with llvm.minimumnum semantics
@@ -250,6 +252,7 @@ class RecurrenceDescriptor {
   /// Returns true if the recurrence kind is a floating-point min/max kind.
   static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
     return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
+           Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum ||
            Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
            Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
   }
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index 938d71dd030e..9e3d9196cc18 100644
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -323,10 +323,11 @@ class MachineBasicBlock
   const MachineFunction *getParent() const { return xParent; }
   MachineFunction *getParent() { return xParent; }
 
-  /// Returns true if the original IR terminator is an `indirectbr`. This
-  /// typically corresponds to a `goto` in C, rather than jump tables.
-  bool terminatorIsComputedGoto() const {
-    return back().isIndirectBranch() &&
+  /// Returns true if the original IR terminator is an `indirectbr` with
+  /// successor blocks. This typically corresponds to a `goto` in C, rather than
+  /// jump tables.
+  bool terminatorIsComputedGotoWithSuccessors() const {
+    return back().isIndirectBranch() && !succ_empty() &&
            llvm::all_of(successors(), [](const MachineBasicBlock *Succ) {
              return Succ->isIRBlockAddressTaken();
            });
diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
index cdc80c88b742..611bfe3f8ace 100644
--- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
+++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h
@@ -795,25 +795,9 @@ bool ConstructDecompositionT<C, H>::applyClause(
   // assigned to which leaf constructs.
 
   // [5.2:340:33]
-  auto canMakePrivateCopy = [](llvm::omp::Clause id) {
-    switch (id) {
-    // Clauses with "privatization" property:
-    case llvm::omp::Clause::OMPC_firstprivate:
-    case llvm::omp::Clause::OMPC_in_reduction:
-    case llvm::omp::Clause::OMPC_lastprivate:
-    case llvm::omp::Clause::OMPC_linear:
-    case llvm::omp::Clause::OMPC_private:
-    case llvm::omp::Clause::OMPC_reduction:
-    case llvm::omp::Clause::OMPC_task_reduction:
-      return true;
-    default:
-      return false;
-    }
-  };
-
   bool applied = applyIf(node, [&](const auto &leaf) {
     return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) {
-      return canMakePrivateCopy(n->id);
+      return llvm::omp::isPrivatizingClause(n->id);
     });
   });
 
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h
index 35dafc6d246f..d44c33301bde 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h
@@ -48,6 +48,22 @@ static constexpr inline bool canHaveIterator(Clause C) {
   }
 }
 
+// Can clause C create a private copy of a variable.
+static constexpr inline bool isPrivatizingClause(Clause C) {
+  switch (C) {
+  case OMPC_firstprivate:
+  case OMPC_in_reduction:
+  case OMPC_lastprivate:
+  case OMPC_linear:
+  case OMPC_private:
+  case OMPC_reduction:
+  case OMPC_task_reduction:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static constexpr unsigned FallbackVersion = 52;
 LLVM_ABI ArrayRef<unsigned> getOpenMPVersions();
 
diff --git a/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h b/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
index 8aac9d5b49db..448a6e913eb8 100644
--- a/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
+++ b/llvm/include/llvm/IR/GenericFloatingPointPredicateUtils.h
@@ -135,6 +135,12 @@ template <typename ContextT> class GenericFloatingPointPredicateUtils {
       if (Mode.Input != DenormalMode::IEEE)
         return {Invalid, fcAllFlags, fcAllFlags};
 
+      auto ExactClass = [IsFabs, Src](FPClassTest Mask) {
+        if (IsFabs)
+          Mask = llvm::inverse_fabs(Mask);
+        return exactClass(Src, Mask);
+      };
+
       switch (Pred) {
       case FCmpInst::FCMP_OEQ: // Match x == 0.0
         return exactClass(Src, fcZero);
@@ -151,26 +157,24 @@ template <typename ContextT> class GenericFloatingPointPredicateUtils {
       case FCmpInst::FCMP_UNO:
         return exactClass(Src, fcNan);
       case FCmpInst::FCMP_OGT: // x > 0
-        return exactClass(Src, fcPosSubnormal | fcPosNormal | fcPosInf);
+        return ExactClass(fcPosSubnormal | fcPosNormal | fcPosInf);
       case FCmpInst::FCMP_UGT: // isnan(x) || x > 0
-        return exactClass(Src, fcPosSubnormal | fcPosNormal | fcPosInf | fcNan);
+        return ExactClass(fcPosSubnormal | fcPosNormal | fcPosInf | fcNan);
       case FCmpInst::FCMP_OGE: // x >= 0
-        return exactClass(Src, fcPositive | fcNegZero);
+        return ExactClass(fcPositive | fcNegZero);
       case FCmpInst::FCMP_UGE: // isnan(x) || x >= 0
-        return exactClass(Src, fcPositive | fcNegZero | fcNan);
+        return ExactClass(fcPositive | fcNegZero | fcNan);
       case FCmpInst::FCMP_OLT: // x < 0
-        return exactClass(Src, fcNegSubnormal | fcNegNormal | fcNegInf);
+        return ExactClass(fcNegSubnormal | fcNegNormal | fcNegInf);
       case FCmpInst::FCMP_ULT: // isnan(x) || x < 0
-        return exactClass(Src, fcNegSubnormal | fcNegNormal | fcNegInf | fcNan);
+        return ExactClass(fcNegSubnormal | fcNegNormal | fcNegInf | fcNan);
       case FCmpInst::FCMP_OLE: // x <= 0
-        return exactClass(Src, fcNegative | fcPosZero);
+        return ExactClass(fcNegative | fcPosZero);
       case FCmpInst::FCMP_ULE: // isnan(x) || x <= 0
-        return exactClass(Src, fcNegative | fcPosZero | fcNan);
+        return ExactClass(fcNegative | fcPosZero | fcNan);
       default:
         llvm_unreachable("all compare types are handled");
       }
-
-      return {Invalid, fcAllFlags, fcAllFlags};
     }
 
     const bool IsDenormalRHS = (OrigClass & fcSubnormal) == OrigClass;
diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td
index a6254eafa490..9b081e9d6544 100644
--- a/llvm/include/llvm/IR/RuntimeLibcalls.td
+++ b/llvm/include/llvm/IR/RuntimeLibcalls.td
@@ -2120,7 +2120,7 @@ defvar X86CommonLibcalls =
 );
 
 defvar Windows32DivRemMulCalls =
-  LibcallImpls<(add WindowsDivRemMulLibcalls),
+  LibcallsWithCC<(add WindowsDivRemMulLibcalls), X86_STDCALL,
   RuntimeLibcallPredicate<"TT.isWindowsMSVCEnvironment() || TT.isWindowsItaniumEnvironment()">>;
 
 def X86_32SystemLibrary
diff --git a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
index 3a7ca1a69ab8..cae2fbcac1fe 100644
--- a/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/llvm/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -136,6 +136,18 @@ class LLVM_ABI MCDisassembler {
                                       ArrayRef<uint8_t> Bytes, uint64_t Address,
                                       raw_ostream &CStream) const = 0;
 
+  /// Returns the disassembly of an instruction bundle for VLIW architectures
+  /// like Hexagon.
+  ///
+  /// \param Instr    - An MCInst to populate with the contents of
+  /// the Bundle with sub-instructions encoded as Inst operands.
+  virtual DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+                                            ArrayRef<uint8_t> Bytes,
+                                            uint64_t Address,
+                                            raw_ostream &CStream) const {
+    return Fail;
+  }
+
   /// Used to perform separate target specific disassembly for a particular
   /// symbol. May parse any prelude that precedes instructions after the
   /// start of a symbol, or the entire symbol.
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 657f4230379e..1c988825d796 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -197,7 +197,8 @@ class Triple {
     SUSE,
     OpenEmbedded,
     Intel,
-    LastVendorType = Intel
+    Amazon,
+    LastVendorType = Amazon
   };
   enum OSType {
     UnknownOS,
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 39f74beca082..8be5de3bf356 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -941,10 +941,30 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
                   m_Intrinsic<Intrinsic::minimumnum>(m_Value(), m_Value())) ||
             match(I, m_Intrinsic<Intrinsic::maximumnum>(m_Value(), m_Value()));
     };
-    if (isIntMinMaxRecurrenceKind(Kind) ||
-        (HasRequiredFMF() && isFPMinMaxRecurrenceKind(Kind)))
+    if (isIntMinMaxRecurrenceKind(Kind))
       return isMinMaxPattern(I, Kind, Prev);
-    else if (isFMulAddIntrinsic(I))
+    if (isFPMinMaxRecurrenceKind(Kind)) {
+      InstDesc Res = isMinMaxPattern(I, Kind, Prev);
+      if (!Res.isRecurrence())
+        return InstDesc(false, I);
+      if (HasRequiredFMF())
+        return Res;
+      // We may be able to vectorize FMax/FMin reductions using maxnum/minnum
+      // intrinsics with extra checks ensuring the vector loop handles only
+      // non-NaN inputs.
+      if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value()))) {
+        assert(Kind == RecurKind::FMax &&
+               "unexpected recurrence kind for maxnum");
+        return InstDesc(I, RecurKind::FMaxNum);
+      }
+      if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value()))) {
+        assert(Kind == RecurKind::FMin &&
+               "unexpected recurrence kind for minnum");
+        return InstDesc(I, RecurKind::FMinNum);
+      }
+      return InstDesc(false, I);
+    }
+    if (isFMulAddIntrinsic(I))
       return InstDesc(Kind == RecurKind::FMulAdd, I,
                       I->hasAllowReassoc() ? nullptr : I);
     return InstDesc(false, I);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 9bbb89e37865..3d1408256df8 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2096,6 +2096,10 @@ static bool isRemOfLoopIncrementWithLoopInvariant(
   if (!L->isLoopInvariant(RemAmt))
     return false;
 
+  // Only works if the AddOffset is a loop invaraint
+  if (AddOffset && !L->isLoopInvariant(AddOffset))
+    return false;
+
   // Is the PHI a loop increment?
   auto LoopIncrInfo = getIVIncrement(PN, LI);
   if (!LoopIncrInfo)
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index b38a4d1c55af..90005bd181f3 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -4279,8 +4279,8 @@ void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
             !TII->isGlobalMemoryObject(FromMI) &&
             !TII->isGlobalMemoryObject(ToMI) && !isSuccOrder(From, To)) {
           SDep Pred = Dep;
-          Pred.setSUnit(Src);
-          Dst->addPred(Pred);
+          Pred.setSUnit(From);
+          To->addPred(Pred);
         }
       }
     }
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 8de2c48581a1..9fa96e737296 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -587,12 +587,14 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const {
       break;
     case Intrinsic::exp:
     case Intrinsic::exp2:
+    case Intrinsic::log:
       Changed |= forEachCall(F, [&](CallInst *CI) {
         Type *Ty = CI->getArgOperand(0)->getType();
         if (!isa<ScalableVectorType>(Ty))
           return false;
         const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
         unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID());
+        assert(Op != ISD::DELETED_NODE && "unsupported intrinsic");
         if (!TL->isOperationExpand(Op, EVT::getEVT(Ty)))
           return false;
         return lowerUnaryVectorIntrinsicAsLoop(M, CI);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 23812d795f5f..91fd2d843f44 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16717,7 +16717,8 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
-  // conditions 1) one-use, and 2) does not produce poison then push
+  // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
+  // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
   // the freeze through to the operands that are not guaranteed non-poison.
   // NOTE: we will strip poison-generating flags, so ignore them here.
   if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16725,6 +16726,18 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
+  // TOOD: we should always allow multiple operands, however this increases the
+  // likelihood of infinite loops due to the ReplaceAllUsesOfValueWith call
+  // below causing later nodes that share frozen operands to fold again and no
+  // longer being able to confirm other operands are not poison due to recursion
+  // depth limits on isGuaranteedNotToBeUndefOrPoison.
+  bool AllowMultipleMaybePoisonOperands =
+      N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
+      N0.getOpcode() == ISD::BUILD_VECTOR ||
+      N0.getOpcode() == ISD::BUILD_PAIR ||
+      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
+      N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
+
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
   // instead pick undef values to keep those properties, while at the same time
@@ -16757,6 +16770,10 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       MaybePoisonOperandNumbers.push_back(OpNo);
     if (!HadMaybePoisonOperands)
       continue;
+    if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
+      // Multiple maybe-poison ops when not allowed - bail out.
+      return SDValue();
+    }
   }
   // NOTE: the whole op may be not guaranteed to not be undef or poison because
   // it could create undef or poison due to it's poison-generating flags.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 2cad36eff9c8..fe357106bda6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1551,6 +1551,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::VAARG:              ExpandRes_VAARG(N, Lo, Hi); break;
 
   case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+  case ISD::AssertNoFPClass: ExpandFloatRes_AssertNoFPClass(N, Lo, Hi); break;
   case ISD::FABS:       ExpandFloatRes_FABS(N, Lo, Hi); break;
   case ISD::STRICT_FMINNUM:
   case ISD::FMINNUM:    ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
@@ -1966,6 +1967,13 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
   Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo,
+                                                      SDValue &Hi) {
+  // TODO: Handle ppcf128 by preserving AssertNoFPClass for one of the halves.
+  SDLoc dl(N);
+  GetExpandedFloat(N->getOperand(0), Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9b537248d4ab..4eaa79890e00 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -677,6 +677,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
       SDNode *N, RTLIB::Libcall LC, std::optional<unsigned> CallRetResNo = {});
 
   // clang-format off
+  void ExpandFloatRes_AssertNoFPClass(SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FABS      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FACOS     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FASIN     (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index a88c57fdc165..8cbdadd97981 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -604,12 +604,23 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   bool HasComputedGoto = false;
   if (!TailBB.empty()) {
     HasIndirectbr = TailBB.back().isIndirectBranch();
-    HasComputedGoto = TailBB.terminatorIsComputedGoto();
+    HasComputedGoto = TailBB.terminatorIsComputedGotoWithSuccessors();
   }
 
   if (HasIndirectbr && PreRegAlloc)
     MaxDuplicateCount = TailDupIndirectBranchSize;
 
+  // Allow higher limits when the block has computed-gotos and running after
+  // register allocation. NB. This basically unfactors computed gotos that were
+  // factored early on in the compilation process to speed up edge based data
+  // flow. If we do not unfactor them again, it can seriously pessimize code
+  // with many computed jumps in the source code, such as interpreters.
+  // Therefore we do not restrict the computed gotos.
+  bool DupComputedGotoLate =
+      HasComputedGoto && MF->getTarget().getTargetTriple().isOSDarwin();
+  if (DupComputedGotoLate && !PreRegAlloc)
+    MaxDuplicateCount = std::max(MaxDuplicateCount, 10u);
+
   // Check the instructions in the block to determine whether tail-duplication
   // is invalid or unlikely to be profitable.
   unsigned InstrCount = 0;
@@ -663,12 +674,10 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
   // Duplicating a BB which has both multiple predecessors and successors will
   // may cause huge amount of PHI nodes. If we want to remove this limitation,
   // we have to address https://github.com/llvm/llvm-project/issues/78578.
-  // NB. This basically unfactors computed gotos that were factored early on in
-  // the compilation process to speed up edge based data flow. If we do not
-  // unfactor them again, it can seriously pessimize code with many computed
-  // jumps in the source code, such as interpreters. Therefore we do not
-  // restrict the computed gotos.
-  if (!HasComputedGoto && TailBB.pred_size() > TailDupPredSize &&
+  bool CheckSuccessorAndPredecessorSize =
+      DupComputedGotoLate ? PreRegAlloc : !HasComputedGoto;
+  if (CheckSuccessorAndPredecessorSize &&
+      TailBB.pred_size() > TailDupPredSize &&
       TailBB.succ_size() > TailDupSuccSize) {
     // If TailBB or any of its successors contains a phi, we may have to add a
     // large number of additional phis with additional incoming values.
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6feeb19bb858..ca2a57e9b7b2 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -806,7 +806,17 @@ void TargetLoweringBase::initActions() {
                         ISD::SDIVFIX,        ISD::SDIVFIXSAT,
                         ISD::UDIVFIX,        ISD::UDIVFIXSAT,
                         ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT,
-                        ISD::IS_FPCLASS},
+                        ISD::IS_FPCLASS,     ISD::FCBRT,
+                        ISD::FLOG,           ISD::FLOG2,
+                        ISD::FLOG10,         ISD::FEXP,
+                        ISD::FEXP2,          ISD::FEXP10,
+                        ISD::FFLOOR,         ISD::FNEARBYINT,
+                        ISD::FCEIL,          ISD::FRINT,
+                        ISD::FTRUNC,         ISD::FROUNDEVEN,
+                        ISD::FTAN,           ISD::FACOS,
+                        ISD::FASIN,          ISD::FATAN,
+                        ISD::FCOSH,          ISD::FSINH,
+                        ISD::FTANH,          ISD::FATAN2},
                        VT, Expand);
 
     // Overflow operations default to expand
@@ -852,13 +862,12 @@ void TargetLoweringBase::initActions() {
 
     // These operations default to expand for vector types.
     if (VT.isVector())
-      setOperationAction(
-          {ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG, ISD::ANY_EXTEND_VECTOR_INREG,
-           ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG,
-           ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND,
-           ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN,
-           ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2},
-          VT, Expand);
+      setOperationAction({ISD::FCOPYSIGN, ISD::SIGN_EXTEND_INREG,
+                          ISD::ANY_EXTEND_VECTOR_INREG,
+                          ISD::SIGN_EXTEND_VECTOR_INREG,
+                          ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR,
+                          ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND},
+                         VT, Expand);
 
       // Constrained floating-point operations default to expand.
 #define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN)               \
@@ -914,15 +923,6 @@ void TargetLoweringBase::initActions() {
                      {MVT::bf16, MVT::f16, MVT::f32, MVT::f64, MVT::f80, MVT::f128},
                      Expand);
 
-  // These library functions default to expand.
-  setOperationAction({ISD::FCBRT,      ISD::FLOG,  ISD::FLOG2,  ISD::FLOG10,
-                      ISD::FEXP,       ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR,
-                      ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT,  ISD::FTRUNC,
-                      ISD::FROUNDEVEN, ISD::FTAN,  ISD::FACOS,  ISD::FASIN,
-                      ISD::FATAN,      ISD::FCOSH, ISD::FSINH,  ISD::FTANH,
-                      ISD::FATAN2},
-                     {MVT::f32, MVT::f64, MVT::f128}, Expand);
-
   // Insert custom handling default for llvm.canonicalize.*.
   setOperationAction(ISD::FCANONICALIZE,
                      {MVT::f16, MVT::f32, MVT::f64, MVT::f128}, Expand);
@@ -1922,6 +1922,8 @@ int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const {
     return ISD::FEXP;
   case Intrinsic::exp2:
     return ISD::FEXP2;
+  case Intrinsic::log:
+    return ISD::FLOG;
   default:
     return ISD::DELETED_NODE;
   }
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 8004077b9266..9afbefb8c08e 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -3185,12 +3185,6 @@ void Verifier::visitFunction(const Function &F) {
     CheckDI(SP->describes(&F),
             "!dbg attachment points at wrong subprogram for function", N, &F,
             &I, DL, Scope, SP);
-
-    if (DL->getAtomGroup())
-      CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(),
-              "DbgLoc uses atomGroup but DISubprogram doesn't have Key "
-              "Instructions enabled",
-              DL, DL->getScope()->getSubprogram());
   };
   for (auto &BB : F)
     for (auto &I : BB) {
@@ -5492,6 +5486,15 @@ void Verifier::visitInstruction(Instruction &I) {
   if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
     CheckDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
     visitMDNode(*N, AreDebugLocsAllowed::Yes);
+
+    if (auto *DL = dyn_cast<DILocation>(N)) {
+      if (DL->getAtomGroup()) {
+        CheckDI(DL->getScope()->getSubprogram()->getKeyInstructionsEnabled(),
+                "DbgLoc uses atomGroup but DISubprogram doesn't have Key "
+                "Instructions enabled",
+                DL, DL->getScope()->getSubprogram());
+      }
+    }
   }
 
   if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index a19659174409..a877eb7938eb 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -361,6 +361,9 @@ static void attemptToFoldSymbolOffsetDifference(const MCAssembler *Asm,
         if (BBeforeRelax && AAfterRelax)
           return;
       }
+      const auto *RF = dyn_cast<MCRelaxableFragment>(F);
+      if (RF && RF->isLinkerRelaxable())
+        return;
       if (&*F == FA) {
         // If FA and FB belong to the same subsection, the loop will find FA and
         // we can resolve the difference.
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index c59fabe5df1b..28cddb939646 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -70,6 +70,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
       OS << "\n  Fixup @" << F.getOffset() << " Value:";
       F.getValue()->print(OS, nullptr);
       OS << " Kind:" << F.getKind();
+      if (F.isLinkerRelaxable())
+        OS << " LinkerRelaxable";
     }
   };
 
@@ -113,6 +115,8 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   }
   case MCFragment::FT_Relaxable:  {
     const auto *F = cast<MCRelaxableFragment>(this);
+    if (F->isLinkerRelaxable())
+      OS << " LinkerRelaxable";
     OS << " Size:" << F->getContents().size() << ' ';
     F->getInst().dump_pretty(OS);
     printFixups(F->getFixups());
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 44a82f75576b..d9c39bbedf37 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -408,6 +408,13 @@ void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
       Inst, IF->getContentsForAppending(), Fixups, STI);
   IF->doneAppending();
   IF->appendFixups(Fixups);
+
+  for (auto &Fixup : Fixups) {
+    if (Fixup.isLinkerRelaxable()) {
+      IF->setLinkerRelaxable();
+      getCurrentSectionOnly()->setLinkerRelaxable();
+    }
+  }
 }
 
 #ifndef NDEBUG
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index beb472b7c7de..a7330692571d 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -63,6 +63,8 @@ LLVM_DUMP_METHOD void MCSection::dump(
   raw_ostream &OS = errs();
 
   OS << "MCSection Name:" << getName();
+  if (isLinkerRelaxable())
+    OS << " LinkerRelaxable";
   for (auto &F : *this) {
     OS << '\n';
     F.dump();
diff --git a/llvm/lib/ObjCopy/COFF/COFFReader.cpp b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
index 62a71d41ded5..9b55f76e5840 100644
--- a/llvm/lib/ObjCopy/COFF/COFFReader.cpp
+++ b/llvm/lib/ObjCopy/COFF/COFFReader.cpp
@@ -135,7 +135,7 @@ Error COFFReader::readSymbols(Object &Obj, bool IsBigObj) const {
     // it is, find the target section unique id.
     const coff_aux_section_definition *SD = SymRef.getSectionDefinition();
     const coff_aux_weak_external *WE = SymRef.getWeakExternal();
-    if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    if (SD && SD->Selection == IMAGE_COMDAT_SELECT_ASSOCIATIVE && !Obj.IsPE) {
       int32_t Index = SD->getNumber(IsBigObj);
       if (Index <= 0 || static_cast<uint32_t>(Index - 1) >= Sections.size())
         return createStringError(object_error::parse_failed,
diff --git a/llvm/lib/Support/BLAKE3/blake3_dispatch.c b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
index d00580fe3519..19918aa708b2 100644
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
@@ -236,7 +236,7 @@ void blake3_xof_many(const uint32_t cv[8],
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
   MAYBE_UNUSED(features);
-#if !defined(_WIN32) && !defined(BLAKE3_NO_AVX512)
+#if !defined(_WIN32) && !defined(__CYGWIN__) && !defined(BLAKE3_NO_AVX512)
   if (features & AVX512VL) {
     blake3_xof_many_avx512(cv, block, block_len, counter, flags, out, outblocks);
     return;
diff --git a/llvm/lib/Support/BLAKE3/blake3_impl.h b/llvm/lib/Support/BLAKE3/blake3_impl.h
index deed079e468a..dd71e729f208 100644
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
@@ -324,7 +324,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t *out);
 
-#if !defined(_WIN32)
+#if !defined(_WIN32) && !defined(__CYGWIN__)
 LLVM_LIBRARY_VISIBILITY
 void blake3_xof_many_avx512(const uint32_t cv[8],
                             const uint8_t block[BLAKE3_BLOCK_LEN],
diff --git a/llvm/lib/Support/BLAKE3/blake3_neon.c b/llvm/lib/Support/BLAKE3/blake3_neon.c
index 9629e1083686..ee36721f8757 100644
--- a/llvm/lib/Support/BLAKE3/blake3_neon.c
+++ b/llvm/lib/Support/BLAKE3/blake3_neon.c
@@ -245,10 +245,11 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
       counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
 }
 
-void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
-                       const uint32_t key[8], uint64_t counter,
-                       bool increment_counter, uint8_t flags,
-                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+static void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              bool increment_counter, uint8_t flags,
+                              uint8_t flags_start, uint8_t flags_end,
+                              uint8_t *out) {
   uint32x4_t h_vecs[8] = {
       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
diff --git a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
index d5be360815ad..d24657465dd8 100644
--- a/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
+++ b/llvm/lib/Support/BLAKE3/llvm_blake3_prefix.h
@@ -10,7 +10,9 @@
 #define blake3_hasher llvm_blake3_hasher
 #define blake3_chunk_state llvm_blake3_chunk_state
 #define blake3_compress_in_place llvm_blake3_compress_in_place
+#define blake3_compress_subtree_wide llvm_blake3_compress_subtree_wide
 #define blake3_compress_xof llvm_blake3_compress_xof
+#define blake3_xof_many llvm_blake3_xof_many
 #define blake3_hash_many llvm_blake3_hash_many
 #define blake3_simd_degree llvm_blake3_simd_degree
 #define blake3_compress_in_place_portable llvm_blake3_compress_in_place_portable
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index 601f11f6d23c..1c4645ad8364 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -501,8 +501,14 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize,
     std::unique_ptr<MB> Result(
         new (NamedBufferAlloc(Filename)) MemoryBufferMMapFile<MB>(
             RequiresNullTerminator, FD, MapSize, Offset, EC));
-    if (!EC)
-      return std::move(Result);
+    if (!EC) {
+      // On at least Linux, and possibly on other systems, mmap may return pages
+      // from the page cache that are not properly filled with trailing zeroes,
+      // if some prior user of the page wrote non-zero bytes. Detect this and
+      // don't use mmap in that case.
+      if (!RequiresNullTerminator || *Result->getBufferEnd() == '\0')
+        return std::move(Result);
+    }
   }
 
 #ifdef __MVS__
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index c4b43e1b5126..4bd650b37792 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -2195,13 +2195,24 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   if (BrTarget == AddrDisc)
     report_fatal_error("Branch target is signed with its own value");
 
-  // If we are printing BLRA pseudo instruction, then x16 and x17 are
-  // implicit-def'ed by the MI and AddrDisc is not used as any other input, so
-  // try to save one MOV by setting MayUseAddrAsScratch.
+  // If we are printing BLRA pseudo, try to save one MOV by making use of the
+  // fact that x16 and x17 are described as clobbered by the MI instruction and
+  // AddrDisc is not used as any other input.
+  //
+  // Back in the day, emitPtrauthDiscriminator was restricted to only returning
+  // either x16 or x17, meaning the returned register is always among the
+  // implicit-def'ed registers of BLRA pseudo. Now this property can be violated
+  // if isX16X17Safer predicate is false, thus manually check if AddrDisc is
+  // among x16 and x17 to prevent clobbering unexpected registers.
+  //
   // Unlike BLRA, BRA pseudo is used to perform computed goto, and thus not
   // declared as clobbering x16/x17.
+  //
+  // FIXME: Make use of `killed` flags and register masks instead.
+  bool AddrDiscIsImplicitDef =
+      IsCall && (AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17);
   Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, AArch64::X17,
-                                              /*MayUseAddrAsScratch=*/IsCall);
+                                              AddrDiscIsImplicitDef);
   bool IsZeroDisc = DiscReg == AArch64::XZR;
 
   unsigned Opc;
@@ -2930,8 +2941,15 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
     // See the comments in emitPtrauthBranch.
     if (Callee == AddrDisc)
       report_fatal_error("Call target is signed with its own value");
+
+    // After isX16X17Safer predicate was introduced, emitPtrauthDiscriminator is
+    // no longer restricted to only reusing AddrDisc when it is X16 or X17
+    // (which are implicit-def'ed by AUTH_TCRETURN pseudos), thus impose this
+    // restriction manually not to clobber an unexpected register.
+    bool AddrDiscIsImplicitDef =
+        AddrDisc == AArch64::X16 || AddrDisc == AArch64::X17;
     Register DiscReg = emitPtrauthDiscriminator(Disc, AddrDisc, ScratchReg,
-                                                /*MayUseAddrAsScratch=*/true);
+                                                AddrDiscIsImplicitDef);
 
     const bool IsZero = DiscReg == AArch64::XZR;
     const unsigned Opcodes[2][2] = {{AArch64::BRAA, AArch64::BRAAZ},
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 3436dc9ef452..137ff898e86a 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -30,6 +30,14 @@ using namespace llvm;
 #define AARCH64_BRANCH_TARGETS_NAME "AArch64 Branch Targets"
 
 namespace {
+// BTI HINT encoding: base (32) plus 'c' (2) and/or 'j' (4).
+enum : unsigned {
+  BTIBase = 32,   // Base immediate for BTI HINT
+  BTIC = 1u << 1, // 2
+  BTIJ = 1u << 2, // 4
+  BTIMask = BTIC | BTIJ,
+};
+
 class AArch64BranchTargets : public MachineFunctionPass {
 public:
   static char ID;
@@ -42,6 +50,7 @@ class AArch64BranchTargets : public MachineFunctionPass {
   void addBTI(MachineBasicBlock &MBB, bool CouldCall, bool CouldJump,
               bool NeedsWinCFI);
 };
+
 } // end anonymous namespace
 
 char AArch64BranchTargets::ID = 0;
@@ -62,9 +71,8 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getInfo<AArch64FunctionInfo>()->branchTargetEnforcement())
     return false;
 
-  LLVM_DEBUG(
-      dbgs() << "********** AArch64 Branch Targets  **********\n"
-             << "********** Function: " << MF.getName() << '\n');
+  LLVM_DEBUG(dbgs() << "********** AArch64 Branch Targets  **********\n"
+                    << "********** Function: " << MF.getName() << '\n');
   const Function &F = MF.getFunction();
 
   // LLVM does not consider basic blocks which are the targets of jump tables
@@ -103,6 +111,12 @@ bool AArch64BranchTargets::runOnMachineFunction(MachineFunction &MF) {
         JumpTableTargets.count(&MBB))
       CouldJump = true;
 
+    if (MBB.isEHPad()) {
+      if (HasWinCFI && (MBB.isEHFuncletEntry() || MBB.isCleanupFuncletEntry()))
+        CouldCall = true;
+      else
+        CouldJump = true;
+    }
     if (CouldCall || CouldJump) {
       addBTI(MBB, CouldCall, CouldJump, HasWinCFI);
       MadeChange = true;
@@ -130,7 +144,12 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
 
   auto MBBI = MBB.begin();
 
-  // Skip the meta instructions, those will be removed anyway.
+  // If the block starts with EH_LABEL(s), skip them first.
+  while (MBBI != MBB.end() && MBBI->isEHLabel()) {
+    ++MBBI;
+  }
+
+  // Skip meta/CFI/etc. (and EMITBKEY) to reach the first executable insn.
   for (; MBBI != MBB.end() &&
          (MBBI->isMetaInstruction() || MBBI->getOpcode() == AArch64::EMITBKEY);
        ++MBBI)
@@ -138,16 +157,21 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
 
   // SCTLR_EL1.BT[01] is set to 0 by default which means
   // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there.
-  if (MBBI != MBB.end() && HintNum == 34 &&
+  if (MBBI != MBB.end() && ((HintNum & BTIMask) == BTIC) &&
       (MBBI->getOpcode() == AArch64::PACIASP ||
        MBBI->getOpcode() == AArch64::PACIBSP))
     return;
 
-  if (HasWinCFI && MBBI->getFlag(MachineInstr::FrameSetup)) {
-    BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
-            TII->get(AArch64::SEH_Nop));
+  // Insert BTI exactly at the first executable instruction.
+  const DebugLoc DL = MBB.findDebugLoc(MBBI);
+  MachineInstr *BTI = BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT))
+                          .addImm(HintNum)
+                          .getInstr();
+
+  // WinEH: put .seh_nop after BTI when the first real insn is FrameSetup.
+  if (HasWinCFI && MBBI != MBB.end() &&
+      MBBI->getFlag(MachineInstr::FrameSetup)) {
+    auto AfterBTI = std::next(MachineBasicBlock::iterator(BTI));
+    BuildMI(MBB, AfterBTI, DL, TII->get(AArch64::SEH_Nop));
   }
-  BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
-          TII->get(AArch64::HINT))
-      .addImm(HintNum);
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 36f3a670808d..7de66ccbf6f2 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -671,8 +671,8 @@ bool AArch64ExpandPseudo::expand_DestructiveOp(
   }
 
   if (PRFX) {
-    finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
     transferImpOps(MI, PRFX, DOP);
+    finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
   } else
     transferImpOps(MI, DOP, DOP);
 
@@ -1591,18 +1591,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
          "Non-writeback variants of STGloop / STZGloop should not "
          "survive past PrologEpilogInserter.");
    case AArch64::STR_ZZZZXI:
+   case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
    case AArch64::STR_ZZZXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
    case AArch64::STR_ZZXI:
+   case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
    case AArch64::STR_PPXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
    case AArch64::LDR_ZZZZXI:
+   case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
    case AArch64::LDR_ZZZXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
    case AArch64::LDR_ZZXI:
+   case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
    case AArch64::LDR_PPXI:
      return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index c1474773faa7..5420545cc3ce 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2482,8 +2482,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::LDR_PXI:
   case AArch64::LDR_ZXI:
   case AArch64::LDR_ZZXI:
+  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZZXI:
   case AArch64::LDR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDRBBui:
   case AArch64::LDRBui:
   case AArch64::LDRDui:
@@ -2525,8 +2527,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
   case AArch64::STR_PXI:
   case AArch64::STR_ZXI:
   case AArch64::STR_ZZXI:
+  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::STR_ZZZXI:
   case AArch64::STR_ZZZZXI:
+  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::STRBBui:
   case AArch64::STRBui:
   case AArch64::STRDui:
@@ -4318,7 +4322,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     break;
   // SVE
   case AArch64::STR_ZZZZXI:
+  case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZZZXI:
+  case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
     Scale = TypeSize::getScalable(16);
     Width = TypeSize::getScalable(16 * 4);
     MinOffset = -256;
@@ -4332,7 +4338,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
     MaxOffset = 253;
     break;
   case AArch64::STR_ZZXI:
+  case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
   case AArch64::LDR_ZZXI:
+  case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
     Scale = TypeSize::getScalable(16);
     Width = TypeSize::getScalable(16 * 2);
     MinOffset = -256;
@@ -5559,8 +5567,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Twov2d;
       Offset = false;
-    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register store without SVE store instructions");
+      Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZZXI;
@@ -5584,8 +5596,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
       assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
       Opc = AArch64::ST1Fourv2d;
       Offset = false;
-    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register store without SVE store instructions");
+      Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register store without SVE store instructions");
       Opc = AArch64::STR_ZZZZXI;
@@ -5736,8 +5752,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Twov2d;
       Offset = false;
-    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register load without SVE load instructions");
+      Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZZXI;
@@ -5761,8 +5781,12 @@ void AArch64InstrInfo::loadRegFromStackSlot(
       assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
       Opc = AArch64::LD1Fourv2d;
       Offset = false;
-    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) ||
-               AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+    } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) {
+      assert(Subtarget.isSVEorStreamingSVEAvailable() &&
+             "Unexpected register load without SVE load instructions");
+      Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS;
+      StackID = TargetStackID::ScalableVector;
+    } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
       assert(Subtarget.isSVEorStreamingSVEAvailable() &&
              "Unexpected register load without SVE load instructions");
       Opc = AArch64::LDR_ZZZZXI;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ddc685fae5e9..bbcd6824473c 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -2148,7 +2148,7 @@ let Predicates = [HasPAuth] in {
                             i64imm:$Disc, GPR64:$AddrDisc),
                        [], "$AuthVal = $Val">, Sched<[WriteI, ReadI]> {
     let isCodeGenOnly = 1;
-    let hasSideEffects = 0;
+    let hasSideEffects = 1;
     let mayStore = 0;
     let mayLoad = 0;
     let Size = 32;
diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td
index 5379305bc7a7..ec7e5201a6e7 100644
--- a/llvm/lib/Target/AArch64/AArch64Processors.td
+++ b/llvm/lib/Target/AArch64/AArch64Processors.td
@@ -21,7 +21,11 @@ def TuneA320 : SubtargetFeature<"a320", "ARMProcFamily", "CortexA320",
                                    "Cortex-A320 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   // Downstream issue: #482 (Use
+                                   // FeatureUseFixedOverScalableIfEqualCost
+                                   // for A320)
+                                   FeatureUseFixedOverScalableIfEqualCost]>;
 
 def TuneA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors", [
@@ -41,14 +45,22 @@ def TuneA510    : SubtargetFeature<"a510", "ARMProcFamily", "CortexA510",
                                    "Cortex-A510 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler
+                                   FeaturePostRAScheduler,
+                                   // Downstream issue: #482 (Use
+                                   // FeatureUseFixedOverScalableIfEqualCost
+                                   // for A320)
+                                   FeatureUseFixedOverScalableIfEqualCost
                                    ]>;
 
 def TuneA520    : SubtargetFeature<"a520", "ARMProcFamily", "CortexA520",
                                    "Cortex-A520 ARM processors", [
                                    FeatureFuseAES,
                                    FeatureFuseAdrpAdd,
-                                   FeaturePostRAScheduler]>;
+                                   FeaturePostRAScheduler,
+                                   // Downstream issue: #482 (Use
+                                   // FeatureUseFixedOverScalableIfEqualCost
+                                   // for A320)
+                                   FeatureUseFixedOverScalableIfEqualCost]>;
 
 def TuneA520AE  : SubtargetFeature<"a520ae", "ARMProcFamily", "CortexA520",
                                    "Cortex-A520AE ARM processors", [
@@ -750,7 +762,8 @@ def ProcessorFeatures {
                                  FeatureSB, FeaturePAuth, FeatureSSBS, FeatureSVE, FeatureSVE2,
                                  FeatureComplxNum, FeatureCRC, FeatureDotProd,
                                  FeatureFPARMv8,FeatureFullFP16, FeatureJS, FeatureLSE,
-                                 FeatureUseFixedOverScalableIfEqualCost,
+                                 // Downstream issue: #482 (Use
+                                 // FeatureUseFixedOverScalableIfEqualCost for A320)
                                  FeatureRAS, FeatureRCPC, FeatureRDM, FeatureFPAC];
   list<SubtargetFeature> A520 = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,
@@ -760,7 +773,8 @@ def ProcessorFeatures {
                                  FeatureSVE, FeatureSVE2, FeatureBF16, FeatureComplxNum, FeatureCRC,
                                  FeatureFPARMv8, FeatureFullFP16, FeatureMatMulInt8, FeatureJS,
                                  FeatureNEON, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM,
-                                 FeatureUseFixedOverScalableIfEqualCost,
+                                 // Downstream issue: #482 (Use
+                                 // FeatureUseFixedOverScalableIfEqualCost for A320)
                                  FeatureDotProd, FeatureFPAC];
   list<SubtargetFeature> A520AE = [HasV9_2aOps, FeaturePerfMon, FeatureAM,
                                  FeatureMTE, FeatureETE, FeatureSVEBitPerm,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index eddb96979f7b..0c4b4f4c3ed8 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in {
   // These get expanded to individual LDR_ZXI/STR_ZXI instructions in
   // AArch64ExpandPseudoInsts.
   let mayLoad = 1, hasSideEffects = 0 in {
-    def LDR_ZZXI   : Pseudo<(outs   ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZXI_STRIDED_CONTIGUOUS   : Pseudo<(outs   ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+    def LDR_ZZXI   : Pseudo<(outs   ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
     def LDR_ZZZXI  : Pseudo<(outs  ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def LDR_PPXI   : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def LDR_PPXI   : Pseudo<(outs   PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
   }
   let mayStore = 1, hasSideEffects = 0 in {
-    def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZXI_STRIDED_CONTIGUOUS   : Pseudo<(outs), (ins   ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+
+    def STR_ZZXI   : Pseudo<(outs), (ins   ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
     def STR_ZZZXI  : Pseudo<(outs), (ins  ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
-    def STR_PPXI   : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+    def STR_PPXI   : Pseudo<(outs), (ins   PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
   }
 
   let AddedComplexity = 1 in {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 095682334679..2409cc862f21 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) {
     break;
   case NeoverseV2:
   case NeoverseV3:
+    CacheLineSize = 64;
     EpilogueVectorizationMinVF = 8;
     MaxInterleaveFactor = 4;
     ScatterOverhead = 13;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 07baf29ce701..193be62b28ad 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3724,7 +3724,7 @@ InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
 
 InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-    bool HasRealUse, const Instruction *I, Value *Scalar,
+    const Instruction *I, Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
   assert(Val->isVectorTy() && "This must be a vector type");
 
@@ -3744,12 +3744,10 @@ InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
     }
 
     // The element at index zero is already inside the vector.
-    // - For a physical (HasRealUse==true) insert-element or extract-element
+    // - For a insert-element or extract-element
     // instruction that extracts integers, an explicit FPR -> GPR move is
     // needed. So it has non-zero cost.
-    // - For the rest of cases (virtual instruction or element type is float),
-    // consider the instruction free.
-    if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
+    if (Index == 0 && !Val->getScalarType()->isIntegerTy())
       return 0;
 
     // This is recognising a LD1 single-element structure to one lane of one
@@ -3899,25 +3897,28 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
                                                    unsigned Index,
                                                    const Value *Op0,
                                                    const Value *Op1) const {
-  bool HasRealUse =
-      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, HasRealUse);
+  // Treat insert at lane 0 into a poison vector as having zero cost. This
+  // ensures vector broadcasts via an insert + shuffle (and will be lowered to a
+  // single dup) are treated as cheap.
+  if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
+      isa<PoisonValue>(Op0))
+    return 0;
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
-                                  Scalar, ScalarUserAndIdx);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
+                                  ScalarUserAndIdx);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
                                                    Type *Val,
                                                    TTI::TargetCostKind CostKind,
                                                    unsigned Index) const {
-  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index,
-                                  true /* HasRealUse */, &I);
+  return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I);
 }
 
 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index ff0ab68a16a8..b27eb2ef7a39 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -65,16 +65,14 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> {
 
   // A helper function called by 'getVectorInstrCost'.
   //
-  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost'; 'HasRealUse'
-  // indicates whether the vector instruction is available in the input IR or
-  // just imaginary in vectorizer passes.
-  /// \param ScalarUserAndIdx encodes the information about extracts from a
+  // 'Val' and 'Index' are forwarded from 'getVectorInstrCost';
+  // \param ScalarUserAndIdx encodes the information about extracts from a
   /// vector with 'Scalar' being the value being extracted,'User' being the user
   /// of the extract(nullptr if user is not known before vectorization) and
   /// 'Idx' being the extract lane.
   InstructionCost getVectorInstrCostHelper(
       unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
-      bool HasRealUse, const Instruction *I = nullptr, Value *Scalar = nullptr,
+      const Instruction *I = nullptr, Value *Scalar = nullptr,
       ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx = {}) const;
 
 public:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 3414fe758eff..7b93382d1281 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -392,8 +392,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // Library functions.  These default to Expand, but we have instructions
   // for them.
   setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
-                      ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
-                     MVT::f32, Legal);
+                      ISD::FROUNDEVEN, ISD::FTRUNC},
+                     {MVT::f16, MVT::f32}, Legal);
+  setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
 
   setOperationAction(ISD::FLOG2, MVT::f32, Custom);
   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
@@ -413,9 +414,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
 
-  if (Subtarget->has16BitInsts())
+  if (Subtarget->has16BitInsts()) {
     setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
-  else {
+    setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
+  } else {
     setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
     setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
   }
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index fb72bab03e75..9593038ff2c9 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -370,6 +370,11 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
       setOperationAction(ISD::FMINNUM, VT, Legal);
       setOperationAction(ISD::FMAXNUM, VT, Legal);
       setOperationAction(ISD::FROUND, VT, Legal);
+      setOperationAction(ISD::FROUNDEVEN, VT, Legal);
+      setOperationAction(ISD::FRINT, VT, Legal);
+      setOperationAction(ISD::FTRUNC, VT, Legal);
+      setOperationAction(ISD::FFLOOR, VT, Legal);
+      setOperationAction(ISD::FCEIL, VT, Legal);
       setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
@@ -1507,6 +1512,12 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_,
     setOperationAction(ISD::FLOG2, MVT::f16, Promote);
 
     setOperationAction(ISD::FROUND, MVT::f16, Legal);
+    setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
+    setOperationAction(ISD::FRINT, MVT::f16, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
+    setOperationAction(ISD::FCEIL, MVT::f16, Legal);
   }
 
   if (Subtarget->hasNEON()) {
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 3955f2a252e7..25ad9eccbce5 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -669,7 +669,7 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
       default: {
         // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows
         // us to  fold the constant into the cmp instruction.
-        RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+        RHS = DAG.getSignedConstant(C->getSExtValue() + 1, DL, VT);
         CC = ISD::SETGE;
         break;
       }
@@ -713,7 +713,10 @@ SDValue AVRTargetLowering::getAVRCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
     // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
     // fold the constant into the cmp instruction.
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
-      RHS = DAG.getConstant(C->getSExtValue() + 1, DL, VT);
+      // Doing a "icmp ugt i16 65535, %0" comparison should have been converted
+      // already to something else. Assert to make sure this assumption holds.
+      assert((!C->isAllOnes()) && "integer overflow in comparison transform");
+      RHS = DAG.getConstant(C->getZExtValue() + 1, DL, VT);
       CC = ISD::SETUGE;
       break;
     }
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 5bd31707acb6..bcddb540d35d 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -43,12 +43,12 @@ namespace {
 class HexagonDisassembler : public MCDisassembler {
 public:
   std::unique_ptr<MCInstrInfo const> const MCII;
-  std::unique_ptr<MCInst *> CurrentBundle;
+  mutable std::unique_ptr<MCInst> CurrentBundle;
   mutable MCInst const *CurrentExtender;
 
   HexagonDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                       MCInstrInfo const *MCII)
-      : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(new MCInst *),
+      : MCDisassembler(STI, Ctx), MCII(MCII), CurrentBundle(nullptr),
         CurrentExtender(nullptr) {}
 
   DecodeStatus getSingleInstruction(MCInst &Instr, MCInst &MCB,
@@ -57,7 +57,23 @@ class HexagonDisassembler : public MCDisassembler {
   DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &CStream) const override;
+
+  DecodeStatus getInstructionBundle(MCInst &Instr, uint64_t &Size,
+                                    ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                    raw_ostream &CStream) const override;
+
   void remapInstruction(MCInst &Instr) const;
+
+private:
+  bool makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                  uint64_t &BytesToSkip, raw_ostream &CS) const;
+
+  void resetBundle() const {
+    CurrentBundle.reset();
+    CurrentInstruction = nullptr;
+  }
+
+  mutable MCOperand *CurrentInstruction = nullptr;
 };
 
 static uint64_t fullValue(HexagonDisassembler const &Disassembler, MCInst &MI,
@@ -171,43 +187,88 @@ LLVMInitializeHexagonDisassembler() {
                                          createHexagonDisassembler);
 }
 
-DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 ArrayRef<uint8_t> Bytes,
-                                                 uint64_t Address,
-                                                 raw_ostream &CS) const {
-  CommentStream = &CS;
-
-  DecodeStatus Result = DecodeStatus::Success;
+bool HexagonDisassembler::makeBundle(ArrayRef<uint8_t> Bytes, uint64_t Address,
+                                     uint64_t &BytesToSkip,
+                                     raw_ostream &CS) const {
   bool Complete = false;
-  Size = 0;
+  DecodeStatus Result = DecodeStatus::Success;
 
-  *CurrentBundle = &MI;
-  MI.setOpcode(Hexagon::BUNDLE);
-  MI.addOperand(MCOperand::createImm(0));
+  CurrentBundle.reset(new MCInst);
+  CurrentBundle->setOpcode(Hexagon::BUNDLE);
+  CurrentBundle->addOperand(MCOperand::createImm(0));
   while (Result == Success && !Complete) {
     if (Bytes.size() < HEXAGON_INSTR_SIZE)
-      return MCDisassembler::Fail;
+      return false;
     MCInst *Inst = getContext().createMCInst();
-    Result = getSingleInstruction(*Inst, MI, Bytes, Address, CS, Complete);
-    MI.addOperand(MCOperand::createInst(Inst));
-    Size += HEXAGON_INSTR_SIZE;
+    Result = getSingleInstruction(*Inst, *CurrentBundle, Bytes, Address, CS,
+                                  Complete);
+    CurrentBundle->addOperand(MCOperand::createInst(Inst));
+    BytesToSkip += HEXAGON_INSTR_SIZE;
     Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
   }
   if (Result == MCDisassembler::Fail)
-    return Result;
-  if (Size > HEXAGON_MAX_PACKET_SIZE)
-    return MCDisassembler::Fail;
+    return false;
+  if (BytesToSkip > HEXAGON_MAX_PACKET_SIZE)
+    return false;
 
   const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
   const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI;
-  HexagonMCChecker Checker(getContext(), *MCII, STI_, MI,
+  HexagonMCChecker Checker(getContext(), *MCII, STI_, *CurrentBundle,
                            *getContext().getRegisterInfo(), false);
   if (!Checker.check())
-    return MCDisassembler::Fail;
-  remapInstruction(MI);
+    return false;
+  remapInstruction(*CurrentBundle);
+  return true;
+}
+
+DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 ArrayRef<uint8_t> Bytes,
+                                                 uint64_t Address,
+                                                 raw_ostream &CS) const {
+  CommentStream = &CS;
+
+  Size = 0;
+  uint64_t BytesToSkip = 0;
+
+  if (!CurrentBundle) {
+    if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+      Size = BytesToSkip;
+      resetBundle();
+      return MCDisassembler::Fail;
+    }
+    CurrentInstruction = (CurrentBundle->begin() + 1);
+  }
+
+  MI = *(CurrentInstruction->getInst());
+  Size = HEXAGON_INSTR_SIZE;
+  if (++CurrentInstruction == CurrentBundle->end())
+    resetBundle();
   return MCDisassembler::Success;
 }
 
+DecodeStatus HexagonDisassembler::getInstructionBundle(MCInst &MI,
+                                                       uint64_t &Size,
+                                                       ArrayRef<uint8_t> Bytes,
+                                                       uint64_t Address,
+                                                       raw_ostream &CS) const {
+  CommentStream = &CS;
+  Size = 0;
+  uint64_t BytesToSkip = 0;
+  assert(!CurrentBundle);
+
+  if (!makeBundle(Bytes, Address, BytesToSkip, CS)) {
+    Size = BytesToSkip;
+    resetBundle();
+    return MCDisassembler::Fail;
+  }
+
+  MI = *CurrentBundle;
+  Size = HEXAGON_INSTR_SIZE * HexagonMCInstrInfo::bundleSize(MI);
+  resetBundle();
+
+  return Success;
+}
+
 void HexagonDisassembler::remapInstruction(MCInst &Instr) const {
   for (auto I: HexagonMCInstrInfo::bundleInstructions(Instr)) {
     auto &MI = const_cast<MCInst &>(*I.getInst());
@@ -465,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
     MI.insert(MI.begin() + 1,
               MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
     break;
+  case Hexagon::Y4_crswap10:
+    MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0));
+    break;
   default:
     break;
   }
@@ -482,7 +546,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
     unsigned Offset = 1;
     bool Vector = HexagonMCInstrInfo::isVector(*MCII, MI);
     bool PrevVector = false;
-    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(*CurrentBundle);
     auto i = Instructions.end() - 1;
     for (auto n = Instructions.begin() - 1;; --i, ++Offset) {
       if (i == n)
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 53943de3bc59..e285e0454369 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1640,6 +1640,15 @@ bool HexagonDAGToDAGISel::DetectUseSxtw(SDValue &N, SDValue &R) {
       R = N;
       break;
     }
+    case ISD::AssertSext: {
+      EVT T = cast<VTSDNode>(N.getOperand(1))->getVT();
+      if (T.getSizeInBits() == 32)
+        R = N.getOperand(0);
+      else
+        return false;
+      break;
+    }
+
     default:
       return false;
   }
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 9030e43b7149..f83e06cd3d93 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -33,30 +33,18 @@ void HexagonInstPrinter::printRegName(raw_ostream &O, MCRegister Reg) {
 void HexagonInstPrinter::printInst(const MCInst *MI, uint64_t Address,
                                    StringRef Annot, const MCSubtargetInfo &STI,
                                    raw_ostream &OS) {
-  assert(HexagonMCInstrInfo::isBundle(*MI));
-  assert(HexagonMCInstrInfo::bundleSize(*MI) <= HEXAGON_PACKET_SIZE);
-  assert(HexagonMCInstrInfo::bundleSize(*MI) > 0);
-  HasExtender = false;
-  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MI)) {
-    MCInst const &MCI = *I.getInst();
-    if (HexagonMCInstrInfo::isDuplex(MII, MCI)) {
-      printInstruction(MCI.getOperand(1).getInst(), Address, OS);
-      OS << '\v';
-      HasExtender = false;
-      printInstruction(MCI.getOperand(0).getInst(), Address, OS);
-    } else
-      printInstruction(&MCI, Address, OS);
-    HasExtender = HexagonMCInstrInfo::isImmext(MCI);
-    OS << "\n";
-  }
-
-  bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(*MI);
-  bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(*MI);
-  if (IsLoop0) {
-    OS << (IsLoop1 ? " :endloop01" : " :endloop0");
-  } else if (IsLoop1) {
-    OS << " :endloop1";
+  if (HexagonMCInstrInfo::isDuplex(MII, *MI)) {
+    printInstruction(MI->getOperand(1).getInst(), Address, OS);
+    OS << '\v';
+    HasExtender = false;
+    printInstruction(MI->getOperand(0).getInst(), Address, OS);
+  } else {
+    printInstruction(MI, Address, OS);
   }
+  HasExtender = HexagonMCInstrInfo::isImmext(*MI);
+  if ((MI->getOpcode() & HexagonII::INST_PARSE_MASK) ==
+      HexagonII::INST_PARSE_PACKET_END)
+    HasExtender = false;
 }
 
 void HexagonInstPrinter::printOperand(MCInst const *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 980df819b2c2..bfea50e2d6dc 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -252,8 +252,21 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     std::string Buffer;
     {
       raw_string_ostream TempStream(Buffer);
-      InstPrinter.printInst(&Inst, Address, "", STI, TempStream);
+      for (auto &I : HexagonMCInstrInfo::bundleInstructions(Inst)) {
+        InstPrinter.printInst(I.getInst(), Address, "", STI, TempStream);
+        TempStream << "\n";
+      }
+    }
+
+    std::string LoopString = "";
+    bool IsLoop0 = HexagonMCInstrInfo::isInnerLoop(Inst);
+    bool IsLoop1 = HexagonMCInstrInfo::isOuterLoop(Inst);
+    if (IsLoop0) {
+      LoopString += (IsLoop1 ? " :endloop01" : " :endloop0");
+    } else if (IsLoop1) {
+      LoopString += " :endloop1";
     }
+
     StringRef Contents(Buffer);
     auto PacketBundle = Contents.rsplit('\n');
     auto HeadTail = PacketBundle.first.split('\n');
@@ -275,9 +288,9 @@ class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
     }
 
     if (HexagonMCInstrInfo::isMemReorderDisabled(Inst))
-      OS << "\n\t} :mem_noshuf" << PacketBundle.second;
+      OS << "\n\t} :mem_noshuf" << LoopString;
     else
-      OS << "\t}" << PacketBundle.second;
+      OS << "\t}" << LoopString;
   }
 
   void finish() override { finishAttributeSection(); }
diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
index ac5e7f3891c7..1493bf4cba69 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp
@@ -158,7 +158,12 @@ void LoongArchFrameLowering::processFunctionBeforeFrameFinalized(
   // estimateStackSize has been observed to under-estimate the final stack
   // size, so give ourselves wiggle-room by checking for stack size
   // representable an 11-bit signed field rather than 12-bits.
-  if (!isInt<11>(MFI.estimateStackSize(MF)))
+  // For [x]vstelm.{b/h/w/d} memory instructions with 8 imm offset, 7-bit
+  // signed field is fine.
+  unsigned EstimateStackSize = MFI.estimateStackSize(MF);
+  if (!isInt<11>(EstimateStackSize) ||
+      (MF.getSubtarget<LoongArchSubtarget>().hasExtLSX() &&
+       !isInt<7>(EstimateStackSize)))
     ScavSlotsNum = std::max(ScavSlotsNum, 1u);
 
   // For CFR spill.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index c47987fbf683..d6adcf37f06f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2430,11 +2430,14 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
   }
 
   // make sure that this load is valid and only has one user.
-  if (!IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
+  if (!IsIdeneity || !IdentitySrc || !BVOp->isOnlyUserOf(IdentitySrc.getNode()))
     return SDValue();
 
-  if (IsIdeneity) {
-    auto *LN = cast<LoadSDNode>(IdentitySrc);
+  auto *LN = cast<LoadSDNode>(IdentitySrc);
+  auto ExtType = LN->getExtensionType();
+
+  if ((ExtType == ISD::EXTLOAD || ExtType == ISD::NON_EXTLOAD) &&
+      VT.getScalarSizeInBits() == LN->getMemoryVT().getScalarSizeInBits()) {
     SDVTList Tys =
         LN->isIndexed()
             ? DAG.getVTList(VT, LN->getBasePtr().getValueType(), MVT::Other)
@@ -4563,6 +4566,80 @@ static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
   llvm_unreachable("Unexpected node type for vXi1 sign extension");
 }
 
+static SDValue
+performSETCC_BITCASTCombine(SDNode *N, SelectionDAG &DAG,
+                            TargetLowering::DAGCombinerInfo &DCI,
+                            const LoongArchSubtarget &Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+
+  if (Src.getOpcode() != ISD::SETCC || !Src.hasOneUse())
+    return SDValue();
+
+  bool UseLASX;
+  unsigned Opc = ISD::DELETED_NODE;
+  EVT CmpVT = Src.getOperand(0).getValueType();
+  EVT EltVT = CmpVT.getVectorElementType();
+
+  if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() == 128)
+    UseLASX = false;
+  else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
+           CmpVT.getSizeInBits() == 256)
+    UseLASX = true;
+  else
+    return SDValue();
+
+  SDValue SrcN1 = Src.getOperand(1);
+  switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
+  default:
+    break;
+  case ISD::SETEQ:
+    // x == 0 => not (vmsknez.b x)
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
+    break;
+  case ISD::SETGT:
+    // x > -1 => vmskgez.b x
+    if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+    break;
+  case ISD::SETGE:
+    // x >= 0 => vmskgez.b x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
+    break;
+  case ISD::SETLT:
+    // x < 0 => vmskltz.{b,h,w,d} x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+         EltVT == MVT::i64))
+      Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+    break;
+  case ISD::SETLE:
+    // x <= -1 => vmskltz.{b,h,w,d} x
+    if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
+         EltVT == MVT::i64))
+      Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
+    break;
+  case ISD::SETNE:
+    // x != 0 => vmsknez.b x
+    if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
+      Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
+    break;
+  }
+
+  if (Opc == ISD::DELETED_NODE)
+    return SDValue();
+
+  SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src.getOperand(0));
+  EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
+  V = DAG.getZExtOrTrunc(V, DL, T);
+  return DAG.getBitcast(VT, V);
+}
+
 static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const LoongArchSubtarget &Subtarget) {
@@ -4577,110 +4654,63 @@ static SDValue performBITCASTCombine(SDNode *N, SelectionDAG &DAG,
   if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
     return SDValue();
 
-  unsigned Opc = ISD::DELETED_NODE;
   // Combine SETCC and BITCAST into [X]VMSK{LT,GE,NE} when possible
+  SDValue Res = performSETCC_BITCASTCombine(N, DAG, DCI, Subtarget);
+  if (Res)
+    return Res;
+
+  // Generate vXi1 using [X]VMSKLTZ
+  MVT SExtVT;
+  unsigned Opc;
+  bool UseLASX = false;
+  bool PropagateSExt = false;
+
   if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse()) {
-    bool UseLASX;
     EVT CmpVT = Src.getOperand(0).getValueType();
-    EVT EltVT = CmpVT.getVectorElementType();
-
-    if (Subtarget.hasExtLSX() && CmpVT.getSizeInBits() <= 128)
-      UseLASX = false;
-    else if (Subtarget.has32S() && Subtarget.hasExtLASX() &&
-             CmpVT.getSizeInBits() <= 256)
-      UseLASX = true;
-    else
+    if (CmpVT.getSizeInBits() > 256)
       return SDValue();
-
-    SDValue SrcN1 = Src.getOperand(1);
-    switch (cast<CondCodeSDNode>(Src.getOperand(2))->get()) {
-    default:
-      break;
-    case ISD::SETEQ:
-      // x == 0 => not (vmsknez.b x)
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKEQZ : LoongArchISD::VMSKEQZ;
-      break;
-    case ISD::SETGT:
-      // x > -1 => vmskgez.b x
-      if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
-      break;
-    case ISD::SETGE:
-      // x >= 0 => vmskgez.b x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKGEZ : LoongArchISD::VMSKGEZ;
-      break;
-    case ISD::SETLT:
-      // x < 0 => vmskltz.{b,h,w,d} x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) &&
-          (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
-           EltVT == MVT::i64))
-        Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-      break;
-    case ISD::SETLE:
-      // x <= -1 => vmskltz.{b,h,w,d} x
-      if (ISD::isBuildVectorAllOnes(SrcN1.getNode()) &&
-          (EltVT == MVT::i8 || EltVT == MVT::i16 || EltVT == MVT::i32 ||
-           EltVT == MVT::i64))
-        Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-      break;
-    case ISD::SETNE:
-      // x != 0 => vmsknez.b x
-      if (ISD::isBuildVectorAllZeros(SrcN1.getNode()) && EltVT == MVT::i8)
-        Opc = UseLASX ? LoongArchISD::XVMSKNEZ : LoongArchISD::VMSKNEZ;
-      break;
-    }
   }
 
-  // Generate vXi1 using [X]VMSKLTZ
-  if (Opc == ISD::DELETED_NODE) {
-    MVT SExtVT;
-    bool UseLASX = false;
-    bool PropagateSExt = false;
-    switch (SrcVT.getSimpleVT().SimpleTy) {
-    default:
-      return SDValue();
-    case MVT::v2i1:
-      SExtVT = MVT::v2i64;
-      break;
-    case MVT::v4i1:
-      SExtVT = MVT::v4i32;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v4i64;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v8i1:
-      SExtVT = MVT::v8i16;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v8i32;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v16i1:
-      SExtVT = MVT::v16i8;
-      if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
-        SExtVT = MVT::v16i16;
-        UseLASX = true;
-        PropagateSExt = true;
-      }
-      break;
-    case MVT::v32i1:
-      SExtVT = MVT::v32i8;
+  switch (SrcVT.getSimpleVT().SimpleTy) {
+  default:
+    return SDValue();
+  case MVT::v2i1:
+    SExtVT = MVT::v2i64;
+    break;
+  case MVT::v4i1:
+    SExtVT = MVT::v4i32;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v4i64;
       UseLASX = true;
-      break;
-    };
-    if (UseLASX && !Subtarget.has32S() && !Subtarget.hasExtLASX())
-      return SDValue();
-    Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
-                        : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
-    Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
-  } else {
-    Src = Src.getOperand(0);
-  }
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v8i1:
+    SExtVT = MVT::v8i16;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v8i32;
+      UseLASX = true;
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v16i1:
+    SExtVT = MVT::v16i8;
+    if (Subtarget.hasExtLASX() && checkBitcastSrcVectorSize(Src, 256, 0)) {
+      SExtVT = MVT::v16i16;
+      UseLASX = true;
+      PropagateSExt = true;
+    }
+    break;
+  case MVT::v32i1:
+    SExtVT = MVT::v32i8;
+    UseLASX = true;
+    break;
+  };
+  if (UseLASX && !(Subtarget.has32S() && Subtarget.hasExtLASX()))
+    return SDValue();
+  Src = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL)
+                      : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src);
+  Opc = UseLASX ? LoongArchISD::XVMSKLTZ : LoongArchISD::VMSKLTZ;
 
   SDValue V = DAG.getNode(Opc, DL, MVT::i64, Src);
   EVT T = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements());
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
index 03ce004ed33a..7cefb3f8119b 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchBaseInfo.cpp
@@ -52,6 +52,9 @@ static ABI getTripleABI(const Triple &TT) {
   bool Is64Bit = TT.isArch64Bit();
   ABI TripleABI;
   switch (TT.getEnvironment()) {
+  case llvm::Triple::EnvironmentType::UnknownEnvironment:
+    TripleABI = ABI_Unknown;
+    break;
   case llvm::Triple::EnvironmentType::GNUSF:
   case llvm::Triple::EnvironmentType::MuslSF:
     TripleABI = Is64Bit ? ABI_LP64S : ABI_ILP32S;
@@ -96,7 +99,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
 
   // 1. If the '-target-abi' is valid, use it.
   if (IsABIValidForFeature(ArgProvidedABI)) {
-    if (TT.hasEnvironment() && ArgProvidedABI != TripleABI)
+    if (IsABIValidForFeature(TripleABI) && ArgProvidedABI != TripleABI)
       errs()
           << "warning: triple-implied ABI conflicts with provided target-abi '"
           << ABIName << "', using target-abi\n";
@@ -164,10 +167,7 @@ ABI computeTargetABI(const Triple &TT, const FeatureBitset &FeatureBits,
       return Is64Bit ? ABI_LP64F : ABI_ILP32F;
     return Is64Bit ? ABI_LP64S : ABI_ILP32S;
   };
-  if (ABIName.empty())
-    errs() << "warning: the triple-implied ABI is invalid, ignoring and using "
-              "feature-implied ABI\n";
-  else
+  if (!ABIName.empty())
     errs() << "warning: both target-abi and the triple-implied ABI are "
               "invalid, ignoring and using feature-implied ABI\n";
   return checkABIStandardized(GetFeatureABI());
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 87e06a6d3c08..2903ff75475c 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -747,14 +747,18 @@ void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
     if (FS.empty() && M.size() && F->hasFnAttribute("target-features"))
       FS = F->getFnAttribute("target-features").getValueAsString();
 
+    std::string strFS = FS.str();
+    if (M.size() && F->getFnAttribute("use-soft-float").getValueAsBool())
+      strFS += strFS.empty() ? "+soft-float" : ",+soft-float";
+
     // Compute MIPS architecture attributes based on the default subtarget
     // that we'd have constructed.
     // FIXME: For ifunc related functions we could iterate over and look
     // for a feature string that doesn't match the default one.
     StringRef CPU = MIPS_MC::selectMipsCPU(TT, TM.getTargetCPU());
     const MipsTargetMachine &MTM = static_cast<const MipsTargetMachine &>(TM);
-    const MipsSubtarget STI(TT, CPU, FS, MTM.isLittleEndian(), MTM,
-                            std::nullopt);
+    const MipsSubtarget STI(TT, CPU, StringRef(strFS), MTM.isLittleEndian(),
+                            MTM, std::nullopt);
 
     bool IsABICalls = STI.isABICalls();
     const MipsABIInfo &ABI = MTM.getABI();
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 0e581a7a1650..ec6b38215166 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -522,9 +522,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
-  setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
-  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
-
   setTargetDAGCombine({ISD::SDIVREM, ISD::UDIVREM, ISD::SELECT, ISD::AND,
                        ISD::OR, ISD::ADD, ISD::SUB, ISD::AssertZext, ISD::SHL,
                        ISD::SIGN_EXTEND});
@@ -1360,8 +1357,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::FP_TO_SINT:         return lowerFP_TO_SINT(Op, DAG);
   case ISD::READCYCLECOUNTER:
     return lowerREADCYCLECOUNTER(Op, DAG);
-  case ISD::ConstantFP:
-    return lowerConstantFP(Op, DAG);
   }
   return SDValue();
 }
@@ -3019,30 +3014,6 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
   return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
 }
 
-SDValue MipsTargetLowering::lowerConstantFP(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getSimpleValueType();
-  SDNode *N = Op.getNode();
-  ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(N);
-
-  if (!CFP->isNaN() || Subtarget.isNaN2008()) {
-    return SDValue();
-  }
-
-  APFloat NaNValue = CFP->getValueAPF();
-  auto &Sem = NaNValue.getSemantics();
-
-  // The MSB of the mantissa should be zero for QNaNs in the MIPS legacy NaN
-  // encodings, and one for sNaNs. Check every NaN constants and make sure
-  // they are correctly encoded for legacy encodings.
-  if (!NaNValue.isSignaling()) {
-    APFloat RealQNaN = NaNValue.getSNaN(Sem);
-    return DAG.getConstantFP(RealQNaN, DL, VT);
-  }
-  return SDValue();
-}
-
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 31ac5d4c185b..c65c76ccffc7 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -592,7 +592,6 @@ class TargetRegisterClass;
     SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerConstantFP(SDValue Op, SelectionDAG &DAG) const;
 
     /// isEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 14f05250ad6b..5d496f69172c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5008,12 +5008,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
           return !U.getUser()->use_empty();
         }
 
-        // Handle CopyToReg nodes that will become dead after our replacement
-        if (U.getUser()->getOpcode() == ISD::CopyToReg) {
-          DeadCopyToRegs.push_back(U.getUser());
-          return true;
-        }
-
         // Otherwise, this use prevents us from splitting a value.
         return false;
       }))
@@ -5080,10 +5074,6 @@ combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
     Results.push_back(NewLoad.getValue(NewNumOutputs + I));
 
-  // Remove dead CopyToReg nodes by folding them into the chain they reference
-  for (SDNode *CTR : DeadCopyToRegs)
-    DCI.CombineTo(CTR, CTR->getOperand(0));
-
   return DCI.DAG.getMergeValues(Results, DL);
 }
 
@@ -6418,4 +6408,4 @@ void NVPTXTargetLowering::computeKnownBitsForTargetNode(
   default:
     break;
   }
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 459525ed4ee9..67f59ed507f3 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7296,9 +7296,17 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
       if (!ArgVT.isVector() && !ValVT.isVector() && ArgVT.isInteger() &&
           ValVT.isInteger() &&
           ArgVT.getScalarSizeInBits() < ValVT.getScalarSizeInBits()) {
-        SDValue ArgValueTrunc = DAG.getNode(
-            ISD::TRUNCATE, dl, ArgVT.getSimpleVT() == MVT::i1 ? MVT::i8 : ArgVT,
-            ArgValue);
+        // It is possible to have either real integer values
+        // or integers that were not originally integers.
+        // In the latter case, these could have came from structs,
+        // and these integers would not have an extend on the parameter.
+        // Since these types of integers do not have an extend specified
+        // in the first place, the type of extend that we do should not matter.
+        EVT TruncatedArgVT = ArgVT.isSimple() && ArgVT.getSimpleVT() == MVT::i1
+                                 ? MVT::i8
+                                 : ArgVT;
+        SDValue ArgValueTrunc =
+            DAG.getNode(ISD::TRUNCATE, dl, TruncatedArgVT, ArgValue);
         SDValue ArgValueExt =
             ArgSignExt ? DAG.getSExtOrTrunc(ArgValueTrunc, dl, ValVT)
                        : DAG.getZExtOrTrunc(ArgValueTrunc, dl, ValVT);
@@ -9586,12 +9594,14 @@ static bool isValidSplatLoad(const PPCSubtarget &Subtarget, const SDValue &Op,
   return false;
 }
 
-bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
+bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN,
+                     bool IsLittleEndian) {
   assert(BVN.getNumOperands() > 0 && "Unexpected 0-size build vector");
 
   BitMask.clearAllBits();
   EVT VT = BVN.getValueType(0);
-  APInt ConstValue(VT.getSizeInBits(), 0);
+  unsigned VTSize = VT.getSizeInBits();
+  APInt ConstValue(VTSize, 0);
 
   unsigned EltWidth = VT.getScalarSizeInBits();
 
@@ -9601,8 +9611,10 @@ bool isValidMtVsrBmi(APInt &BitMask, BuildVectorSDNode &BVN) {
 
     if (!CN)
       return false;
-
-    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth), BitPos);
+    // The elements in a vector register are ordered in reverse byte order
+    // between little-endian and big-endian modes.
+    ConstValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltWidth),
+                          IsLittleEndian ? BitPos : VTSize - EltWidth - BitPos);
     BitPos += EltWidth;
   }
 
@@ -9633,7 +9645,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     // we do not convert it to MTVSRBMI.
     // The xxleqv instruction sets a vector with all ones.
     // The xxlxor instruction sets a vector with all zeros.
-    if (isValidMtVsrBmi(BitMask, *BVN) && BitMask != 0 && BitMask != 0xffff) {
+    if (isValidMtVsrBmi(BitMask, *BVN, Subtarget.isLittleEndian()) &&
+        BitMask != 0 && BitMask != 0xffff) {
       SDValue SDConstant = DAG.getTargetConstant(BitMask, dl, MVT::i32);
       MachineSDNode *MSDNode =
           DAG.getMachineNode(PPC::MTVSRBMI, dl, MVT::v16i8, SDConstant);
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index 99ef89a7fdc0..dbbf06f47153 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -3252,7 +3252,8 @@ def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
 
 // Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
 // This uses two output registers, the first as the real output, the second as a
-// temporary register, used internally in code generation.
+// temporary register, used internally in code generation. A "bl" also clobbers LR.
+let Defs = [LR] in
 def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
                 []>, NoEncode<"$rT">;
 
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 76dca4794e05..f1230407b164 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -1102,13 +1102,20 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
     SpillsKnownBit = true;
     break;
   default:
+    // When spilling a CR bit, the super register may not be explicitly defined
+    // (i.e. it can be defined by a CR-logical that only defines the subreg) so
+    // we state that the CR field is undef. Also, in order to preserve the kill
+    // flag on the CR bit, we add it as an implicit use.
+
     // On Power10, we can use SETNBC to spill all CR bits. SETNBC will set all
     // bits (specifically, it produces a -1 if the CR bit is set). Ultimately,
     // the bit that is of importance to us is bit 32 (bit 0 of a 32-bit
     // register), and SETNBC will set this.
     if (Subtarget.isISA3_1()) {
       BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETNBC8 : PPC::SETNBC), Reg)
-          .addReg(SrcReg, RegState::Undef);
+          .addReg(SrcReg, RegState::Undef)
+          .addReg(SrcReg, RegState::Implicit |
+                              getKillRegState(MI.getOperand(0).isKill()));
       break;
     }
 
@@ -1122,16 +1129,14 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
           SrcReg == PPC::CR4LT || SrcReg == PPC::CR5LT ||
           SrcReg == PPC::CR6LT || SrcReg == PPC::CR7LT) {
         BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::SETB8 : PPC::SETB), Reg)
-          .addReg(getCRFromCRBit(SrcReg), RegState::Undef);
+            .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
+            .addReg(SrcReg, RegState::Implicit |
+                                getKillRegState(MI.getOperand(0).isKill()));
         break;
       }
     }
 
     // We need to move the CR field that contains the CR bit we are spilling.
-    // The super register may not be explicitly defined (i.e. it can be defined
-    // by a CR-logical that only defines the subreg) so we state that the CR
-    // field is undef. Also, in order to preserve the kill flag on the CR bit,
-    // we add it as an implicit use.
     BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
       .addReg(getCRFromCRBit(SrcReg), RegState::Undef)
       .addReg(SrcReg,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index e42d6c539a34..df3028e05139 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -792,6 +792,23 @@ void RISCVAsmBackend::maybeAddVendorReloc(const MCFragment &F,
   Asm->getWriter().recordRelocation(F, VendorFixup, VendorTarget, VendorValue);
 }
 
+static bool relaxableFixupNeedsRelocation(const MCFixupKind Kind) {
+  // Some Fixups are marked as LinkerRelaxable by
+  // `RISCVMCCodeEmitter::getImmOpValue` only because they may be
+  // (assembly-)relaxed into a linker-relaxable instruction. This function
+  // should return `false` for those fixups so they do not get a `R_RISCV_RELAX`
+  // relocation emitted in addition to the relocation.
+  switch (Kind) {
+  default:
+    break;
+  case RISCV::fixup_riscv_rvc_jump:
+  case RISCV::fixup_riscv_rvc_branch:
+  case RISCV::fixup_riscv_jal:
+    return false;
+  }
+  return true;
+}
+
 bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
                                const MCValue &Target, uint64_t &FixedValue,
                                bool IsResolved) {
@@ -834,25 +851,32 @@ bool RISCVAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup,
     return false;
   }
 
-  // If linker relaxation is enabled and supported by the current relocation,
-  // generate a relocation and then append a RELAX.
-  if (Fixup.isLinkerRelaxable())
+  // If linker relaxation is enabled and supported by the current fixup, then we
+  // always want to generate a relocation.
+  bool NeedsRelax = Fixup.isLinkerRelaxable() &&
+                    relaxableFixupNeedsRelocation(Fixup.getKind());
+  if (NeedsRelax)
     IsResolved = false;
+
   if (IsResolved && Fixup.isPCRel())
     IsResolved = isPCRelFixupResolved(Target.getAddSym(), F);
 
   if (!IsResolved) {
-    // Some Fixups require a vendor relocation, record it (directly) before we
+    // Some Fixups require a VENDOR relocation, record it (directly) before we
     // add the relocation.
     maybeAddVendorReloc(F, Fixup);
 
     Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue);
-  }
 
-  if (Fixup.isLinkerRelaxable()) {
-    auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
-    Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr),
-                                      FixedValueA);
+    if (NeedsRelax) {
+      // Some Fixups get a RELAX relocation, record it (directly) after we add
+      // the relocation.
+      MCFixup RelaxFixup =
+          MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_RISCV_RELAX);
+      MCValue RelaxTarget = MCValue::get(nullptr);
+      uint64_t RelaxValue;
+      Asm->getWriter().recordRelocation(F, RelaxFixup, RelaxTarget, RelaxValue);
+    }
   }
 
   return false;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index cbeabdddb937..717fba68b48e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -576,8 +576,21 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
          "getImmOpValue expects only expressions or immediates");
   const MCExpr *Expr = MO.getExpr();
   MCExpr::ExprKind Kind = Expr->getKind();
-  unsigned FixupKind = RISCV::fixup_riscv_invalid;
+
+  // `RelaxCandidate` must be set to `true` in two cases:
+  // - The fixup's relocation gets a R_RISCV_RELAX relocation
+  // - The underlying instruction may be relaxed to an instruction that gets a
+  //   `R_RISCV_RELAX` relocation.
+  //
+  // The actual emission of `R_RISCV_RELAX` will be handled in
+  // `RISCVAsmBackend::applyFixup`.
   bool RelaxCandidate = false;
+  auto AsmRelaxToLinkerRelaxableWithFeature = [&](unsigned Feature) -> void {
+    if (!STI.hasFeature(RISCV::FeatureExactAssembly) && STI.hasFeature(Feature))
+      RelaxCandidate = true;
+  };
+
+  unsigned FixupKind = RISCV::fixup_riscv_invalid;
   if (Kind == MCExpr::Specifier) {
     const auto *RVExpr = cast<MCSpecifierExpr>(Expr);
     FixupKind = RVExpr->getSpecifier();
@@ -644,18 +657,26 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
     // FIXME: Sub kind binary exprs have chance of underflow.
     if (MIFrm == RISCVII::InstFormatJ) {
       FixupKind = RISCV::fixup_riscv_jal;
+      AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
     } else if (MIFrm == RISCVII::InstFormatB) {
       FixupKind = RISCV::fixup_riscv_branch;
+      // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+      // the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatCJ) {
       FixupKind = RISCV::fixup_riscv_rvc_jump;
+      AsmRelaxToLinkerRelaxableWithFeature(RISCV::FeatureVendorXqcilb);
     } else if (MIFrm == RISCVII::InstFormatCB) {
       FixupKind = RISCV::fixup_riscv_rvc_branch;
+      // This might be assembler relaxed to `b<cc>; jal` but we cannot relax
+      // the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatCI) {
       FixupKind = RISCV::fixup_riscv_rvc_imm;
     } else if (MIFrm == RISCVII::InstFormatI) {
       FixupKind = RISCV::fixup_riscv_12_i;
     } else if (MIFrm == RISCVII::InstFormatQC_EB) {
       FixupKind = RISCV::fixup_riscv_qc_e_branch;
+      // This might be assembler relaxed to `qc.e.b<cc>; jal` but we cannot
+      // relax the `jal` again in the assembler.
     } else if (MIFrm == RISCVII::InstFormatQC_EAI) {
       FixupKind = RISCV::fixup_riscv_qc_e_32;
       RelaxCandidate = true;
@@ -670,9 +691,9 @@ uint64_t RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
   assert(FixupKind != RISCV::fixup_riscv_invalid && "Unhandled expression!");
 
   addFixup(Fixups, 0, Expr, FixupKind);
-  // If linker relaxation is enabled and supported by this relocation, set
-  // a bit so that if fixup is unresolved, a R_RISCV_RELAX relocation will be
-  // appended.
+  // If linker relaxation is enabled and supported by this relocation, set a bit
+  // so that the assembler knows the size of the instruction is not fixed/known,
+  // and the relocation will need a R_RISCV_RELAX relocation.
   if (EnableRelax && RelaxCandidate)
     Fixups.back().setLinkerRelaxable();
   ++MCNumFixups;
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index a796c910bd44..6c8e3da80b93 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -738,7 +738,8 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
                                        MachineFunction &MF, uint64_t Offset,
                                        uint64_t RealStackSize, bool EmitCFI,
                                        bool NeedProbe, uint64_t ProbeSize,
-                                       bool DynAllocation) const {
+                                       bool DynAllocation,
+                                       MachineInstr::MIFlag Flag) const {
   DebugLoc DL;
   const RISCVRegisterInfo *RI = STI.getRegisterInfo();
   const RISCVInstrInfo *TII = STI.getInstrInfo();
@@ -748,7 +749,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   // Simply allocate the stack if it's not big enough to require a probe.
   if (!NeedProbe || Offset <= ProbeSize) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Offset),
-                  MachineInstr::FrameSetup, getStackAlign());
+                  Flag, getStackAlign());
 
     if (EmitCFI)
       CFIBuilder.buildDefCFAOffset(RealStackSize);
@@ -759,7 +760,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
     }
 
     return;
@@ -770,14 +771,13 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
     uint64_t CurrentOffset = 0;
     while (CurrentOffset + ProbeSize <= Offset) {
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                    StackOffset::getFixed(-ProbeSize), MachineInstr::FrameSetup,
-                    getStackAlign());
+                    StackOffset::getFixed(-ProbeSize), Flag, getStackAlign());
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
 
       CurrentOffset += ProbeSize;
       if (EmitCFI)
@@ -787,8 +787,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
     uint64_t Residual = Offset - CurrentOffset;
     if (Residual) {
       RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg,
-                    StackOffset::getFixed(-Residual), MachineInstr::FrameSetup,
-                    getStackAlign());
+                    StackOffset::getFixed(-Residual), Flag, getStackAlign());
       if (EmitCFI)
         CFIBuilder.buildDefCFAOffset(Offset);
 
@@ -798,7 +797,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
             .addReg(RISCV::X0)
             .addReg(SPReg)
             .addImm(0)
-            .setMIFlags(MachineInstr::FrameSetup);
+            .setMIFlags(Flag);
       }
     }
 
@@ -812,8 +811,7 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
   Register TargetReg = RISCV::X6;
   // SUB TargetReg, SP, RoundedSize
   RI->adjustReg(MBB, MBBI, DL, TargetReg, SPReg,
-                StackOffset::getFixed(-RoundedSize), MachineInstr::FrameSetup,
-                getStackAlign());
+                StackOffset::getFixed(-RoundedSize), Flag, getStackAlign());
 
   if (EmitCFI) {
     // Set the CFA register to TargetReg.
@@ -830,14 +828,14 @@ void RISCVFrameLowering::allocateStack(MachineBasicBlock &MBB,
 
   if (Residual) {
     RI->adjustReg(MBB, MBBI, DL, SPReg, SPReg, StackOffset::getFixed(-Residual),
-                  MachineInstr::FrameSetup, getStackAlign());
+                  Flag, getStackAlign());
     if (DynAllocation) {
       // s[d|w] zero, 0(sp)
       BuildMI(MBB, MBBI, DL, TII->get(IsRV64 ? RISCV::SD : RISCV::SW))
           .addReg(RISCV::X0)
           .addReg(SPReg)
           .addImm(0)
-          .setMIFlags(MachineInstr::FrameSetup);
+          .setMIFlags(Flag);
     }
   }
 
@@ -1034,7 +1032,8 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
       MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
   if (StackSize != 0)
     allocateStack(MBB, MBBI, MF, StackSize, RealStackSize, /*EmitCFI=*/true,
-                  NeedProbe, ProbeSize, DynAllocation);
+                  NeedProbe, ProbeSize, DynAllocation,
+                  MachineInstr::FrameSetup);
 
   // Save SiFive CLIC CSRs into Stack
   emitSiFiveCLICPreemptibleSaves(MF, MBB, MBBI, DL);
@@ -1082,7 +1081,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
 
     allocateStack(MBB, MBBI, MF, SecondSPAdjustAmount,
                   getStackSizeWithRVVPadding(MF), !hasFP(MF), NeedProbe,
-                  ProbeSize, DynAllocation);
+                  ProbeSize, DynAllocation, MachineInstr::FrameSetup);
   }
 
   if (RVVStackSize) {
@@ -1814,7 +1813,8 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
         bool DynAllocation =
             MF.getInfo<RISCVMachineFunctionInfo>()->hasDynamicAllocation();
         allocateStack(MBB, MI, MF, -Amount, -Amount, !hasFP(MF),
-                      /*NeedProbe=*/true, ProbeSize, DynAllocation);
+                      /*NeedProbe=*/true, ProbeSize, DynAllocation,
+                      MachineInstr::NoFlags);
       } else {
         const RISCVRegisterInfo &RI = *STI.getRegisterInfo();
         RI.adjustReg(MBB, MI, DL, SPReg, SPReg, StackOffset::getFixed(Amount),
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index d013755ce58a..6af63a4885f3 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -81,7 +81,8 @@ class RISCVFrameLowering : public TargetFrameLowering {
   void allocateStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                      MachineFunction &MF, uint64_t Offset,
                      uint64_t RealStackSize, bool EmitCFI, bool NeedProbe,
-                     uint64_t ProbeSize, bool DynAllocation) const;
+                     uint64_t ProbeSize, bool DynAllocation,
+                     MachineInstr::MIFlag Flag) const;
 
 protected:
   const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 666c76b21e63..186191abe12a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
   if (!isShiftedMask_32(C1) || isInt<12>(C1))
     return false;
 
+  // INSBI will clobber the input register in N0. Bail out if we need a copy to
+  // preserve this value.
+  SDValue N0 = Node->getOperand(0);
+  if (!N0.hasOneUse())
+    return false;
+
   // If C1 is a shifted mask (but can't be formed as an ORI),
   // use a bitfield insert of -1.
   // Transform (or x, C1)
-  //        -> (qc.insbi x, width, shift)
+  //        -> (qc.insbi x, -1, width, shift)
   const unsigned Leading = llvm::countl_zero((uint32_t)C1);
   const unsigned Trailing = llvm::countr_zero((uint32_t)C1);
   const unsigned Width = 32 - Leading - Trailing;
@@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) {
   SDLoc DL(Node);
   MVT VT = Node->getSimpleValueType(0);
 
-  SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT),
+  SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT),
                    CurDAG->getTargetConstant(Width, DL, VT),
                    CurDAG->getTargetConstant(Trailing, DL, VT)};
   SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops);
@@ -2936,8 +2942,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base,
 /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9.
 bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
                                           SDValue &Offset) {
-  if (SelectAddrFrameIndex(Addr, Base, Offset))
-    return true;
+  // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+  // a 9-bit immediate can be folded.
 
   SDLoc DL(Addr);
   MVT VT = Addr.getSimpleValueType();
@@ -2947,8 +2953,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base,
     if (isUInt<9>(CVal)) {
       Base = Addr.getOperand(0);
 
-      if (auto *FIN = dyn_cast<FrameIndexSDNode>(Base))
-        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT);
+      // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only
+      // a 9-bit immediate can be folded.
       Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT);
       return true;
     }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 9cbc364afc21..5fb16f5ac6b9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16013,7 +16013,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   uint64_t MulAmt = CNode->getZExtValue();
 
   // Don't do this if the Xqciac extension is enabled and the MulAmt in simm12.
-  if (Subtarget.hasVendorXqciac() && isInt<12>(MulAmt))
+  if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue()))
     return SDValue();
 
   const bool HasShlAdd = Subtarget.hasStdExtZba() ||
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
index 26bb1e8d1785..f39130090def 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td
@@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in {
 let Predicates = [HasVendorXqcibm, IsRV32] in {
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
   def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">;
-  def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd),
-                             (ins simm5:$imm5, uimm5_plus1:$width,
+  def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb),
+                             (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width,
                              uimm5:$shamt), "qc.insbi",
                              "$rd, $imm5, $width, $shamt"> {
+    let Constraints = "$rd = $rd_wb";
     bits<5> imm5;
     bits<5> shamt;
     bits<5> width;
@@ -1376,9 +1377,9 @@ let Predicates = [HasVendorXqciac, IsRV32] in {
 def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12:$imm12))),
           (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12:$imm12)>;
 def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, uimm5gt3:$imm), GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
 def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, uimm5gt3:$imm, GPRNoX0:$rs2)),
-          (QC_SHLADD GPRNoX0:$rs2, GPRNoX0:$rs1, uimm5gt3:$imm)>;
+          (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>;
 } // Predicates = [HasVendorXqciac, IsRV32]
 
 /// Simple arithmetic operations
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
index 4147c97a7a23..92bc3ee8bdac 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZvk.td
@@ -1130,13 +1130,13 @@ let Predicates = [HasStdExtZvkned] in {
 
 let Predicates = [HasStdExtZvknha] in {
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32IntegerVectors>;
-  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32IntegerVectors>;
+  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32IntegerVectors>;
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32IntegerVectors, isSEWAware=true>;
 } // Predicates = [HasStdExtZvknha]
 
 let Predicates = [HasStdExtZvknhb] in {
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ch", "PseudoVSHA2CH", I32I64IntegerVectors>;
-  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CH", I32I64IntegerVectors>;
+  defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2cl", "PseudoVSHA2CL", I32I64IntegerVectors>;
   defm : VPatBinaryV_VV_NoMask<"int_riscv_vsha2ms", "PseudoVSHA2MS", I32I64IntegerVectors, isSEWAware=true>;
 } // Predicates = [HasStdExtZvknhb]
 
diff --git a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
index 7a2541a652b5..0d37db0138e4 100644
--- a/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMoveMerger.cpp
@@ -137,6 +137,11 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
     NextI = next_nodbg(NextI, E);
   DebugLoc DL = I->getDebugLoc();
 
+  // Make a copy so we can update the kill flag in the MoveFromAToS case. The
+  // copied operand needs to be scoped outside the if since we make a pointer
+  // to it.
+  MachineOperand PairedSource = *PairedRegs.Source;
+
   // The order of S-reg depends on which instruction holds A0, instead of
   // the order of register pair.
   // e,g.
@@ -147,8 +152,15 @@ RISCVMoveMerge::mergePairedInsns(MachineBasicBlock::iterator I,
   //   mv a1, s1    =>  cm.mva01s s2,s1
   bool StartWithX10 = ARegInFirstPair == RISCV::X10;
   if (isMoveFromAToS(Opcode)) {
-    Sreg1 = StartWithX10 ? FirstPair.Source : PairedRegs.Source;
-    Sreg2 = StartWithX10 ? PairedRegs.Source : FirstPair.Source;
+    // We are moving one of the copies earlier so its kill flag may become
+    // invalid. Clear the copied kill flag if there are any reads of the
+    // register between the new location and the old location.
+    for (auto It = std::next(I); It != Paired && PairedSource.isKill(); ++It)
+      if (It->readsRegister(PairedSource.getReg(), TRI))
+        PairedSource.setIsKill(false);
+
+    Sreg1 = StartWithX10 ? FirstPair.Source : &PairedSource;
+    Sreg2 = StartWithX10 ? &PairedSource : FirstPair.Source;
   } else {
     Sreg1 = StartWithX10 ? FirstPair.Destination : PairedRegs.Destination;
     Sreg2 = StartWithX10 ? PairedRegs.Destination : FirstPair.Destination;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 56ead92187b0..70cfb4b836cc 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1567,6 +1567,7 @@ InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
   // scalarized if the legalized Src and Dst are not equal sized.
   const DataLayout &DL = this->getDataLayout();
   if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
+      !SrcLT.first.isValid() || !DstLT.first.isValid() ||
       !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
                            SrcLT.second.getSizeInBits()) ||
       !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
index 84ef53985484..c1cc19b503de 100644
--- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp
@@ -434,6 +434,15 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) {
   if (!isKnownSameDefs(TrueMask.getReg(), MIMask.getReg()))
     return false;
 
+  // Masked off lanes past TrueVL will come from False, and converting to vmv
+  // will lose these lanes unless MIVL <= TrueVL.
+  // TODO: We could relax this for False == Passthru and True policy == TU
+  const MachineOperand &MIVL = MI.getOperand(RISCVII::getVLOpNum(MI.getDesc()));
+  const MachineOperand &TrueVL =
+      True->getOperand(RISCVII::getVLOpNum(True->getDesc()));
+  if (!RISCV::isVLKnownLE(MIVL, TrueVL))
+    return false;
+
   // True's passthru needs to be equivalent to False
   Register TruePassthruReg = True->getOperand(1).getReg();
   Register FalseReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index 9b434d87c267..f81fdf0ccc82 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1799,12 +1799,14 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
   setOperationAction(ISD::FREM , MVT::f64, Expand);
-  setOperationAction(ISD::FMA  , MVT::f64, Expand);
+  setOperationAction(ISD::FMA, MVT::f64,
+                     Subtarget->isUA2007() ? Legal : Expand);
   setOperationAction(ISD::FSIN , MVT::f32, Expand);
   setOperationAction(ISD::FCOS , MVT::f32, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   setOperationAction(ISD::FREM , MVT::f32, Expand);
-  setOperationAction(ISD::FMA, MVT::f32, Expand);
+  setOperationAction(ISD::FMA, MVT::f32,
+                     Subtarget->isUA2007() ? Legal : Expand);
   setOperationAction(ISD::ROTL , MVT::i32, Expand);
   setOperationAction(ISD::ROTR , MVT::i32, Expand);
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -2201,7 +2203,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
     SDValue Chain = DAG.getEntryNode();
     SDValue InGlue;
 
-    Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL);
+    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
     Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InGlue);
     InGlue = Chain.getValue(1);
     SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
@@ -2219,7 +2221,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                      InGlue};
     Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
     InGlue = Chain.getValue(1);
-    Chain = DAG.getCALLSEQ_END(Chain, 1, 0, InGlue, DL);
+    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, DL);
     InGlue = Chain.getValue(1);
     SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InGlue);
 
@@ -3550,6 +3552,11 @@ bool SparcTargetLowering::isCheapToSpeculateCttz(Type *Ty) const {
   return isCheapToSpeculateCtlz(Ty);
 }
 
+bool SparcTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                                     EVT VT) const {
+  return Subtarget->isUA2007() && !Subtarget->useSoftFloat();
+}
+
 // Override to disable global variable loading on Linux.
 void SparcTargetLowering::insertSSPDeclarations(Module &M) const {
   if (!Subtarget->isTargetLinux())
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index 0d220f8c3d32..4017beb88ff3 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -177,6 +177,11 @@ namespace llvm {
 
     bool isCheapToSpeculateCttz(Type *Ty) const override;
 
+    bool enableAggressiveFMAFusion(EVT VT) const override { return true; };
+
+    bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                    EVT VT) const override;
+
     bool shouldInsertFencesForAtomic(const Instruction *I) const override {
       // FIXME: We insert fences for each atomics and generate
       // sub-optimal code for PSO/TSO. (Approximately nobody uses any
diff --git a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
index 3a30e552e6db..ffd4423137e3 100644
--- a/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
+++ b/llvm/lib/Target/Sparc/SparcInstrUAOSA.td
@@ -66,3 +66,15 @@ defm CXBCOND : F2_56<"cxb", 1>;
 def FPMADDX   : FourOp<"fpmaddx", 0b110111, 0b0000, DFPRegs>;
 def FPMADDXHI : FourOp<"fpmaddxhi", 0b110111, 0b0100, DFPRegs>;
 } // Predicates = [HasOSA2011]
+
+// UA2007 instruction patterns.
+let Predicates = [HasUA2007] in {
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, f32:$add)), (FMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, f64:$add)), (FMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub))), (FMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub))), (FMSUBD $rs1, $rs2, $sub)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, f32:$add))), (FNMADDS $rs1, $rs2, $add)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, f64:$add))), (FNMADDD $rs1, $rs2, $add)>;
+def : Pat<(f32 (fneg (any_fma f32:$rs1, f32:$rs2, (fneg f32:$sub)))), (FNMSUBS $rs1, $rs2, $sub)>;
+def : Pat<(f64 (fneg (any_fma f64:$rs1, f64:$rs2, (fneg f64:$sub)))), (FNMSUBD $rs1, $rs2, $sub)>;
+} // Predicates = [HasUA2007]
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index f32c9bd2bdea..2611c291abaa 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -436,20 +436,6 @@ bool SystemZTTIImpl::isLSRCostLess(
              C2.ScaleCost, C2.SetupCost);
 }
 
-bool SystemZTTIImpl::areInlineCompatible(const Function *Caller,
-                                         const Function *Callee) const {
-  const TargetMachine &TM = getTLI()->getTargetMachine();
-
-  const FeatureBitset &CallerBits =
-      TM.getSubtargetImpl(*Caller)->getFeatureBits();
-  const FeatureBitset &CalleeBits =
-      TM.getSubtargetImpl(*Callee)->getFeatureBits();
-
-  // Support only equal feature bitsets. Restriction should be relaxed in the
-  // future to allow inlining when callee's bits are subset of the caller's.
-  return CallerBits == CalleeBits;
-}
-
 unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
   bool Vector = (ClassID == 1);
   if (!Vector)
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index dc5736e8af00..fc681dec1859 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -65,9 +65,6 @@ class SystemZTTIImpl final : public BasicTTIImplBase<SystemZTTIImpl> {
   bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
                      const TargetTransformInfo::LSRCost &C2) const override;
 
-  bool areInlineCompatible(const Function *Caller,
-                           const Function *Callee) const override;
-
   /// @}
 
   /// \name Vector TTI Implementations
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 2662241ef849..e6486e247209 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -256,9 +256,17 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
 
   // Precompute the set of registers that are unused, so that we can insert
   // drops to their defs.
+  // And unstackify any stackified registers that don't have any uses, so that
+  // they can be dropped later. This can happen when transformations after
+  // RegStackify remove instructions using stackified registers.
   BitVector UseEmpty(MRI.getNumVirtRegs());
-  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I)
-    UseEmpty[I] = MRI.use_empty(Register::index2VirtReg(I));
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
+    Register Reg = Register::index2VirtReg(I);
+    if (MRI.use_empty(Reg)) {
+      UseEmpty[I] = true;
+      MFI.unstackifyVReg(Reg);
+    }
+  }
 
   // Visit each instruction in the function.
   for (MachineBasicBlock &MBB : MF) {
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index b642c1cfe383..8213e512f45e 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1042,8 +1042,8 @@ class X86AsmParser : public MCTargetAsmParser {
       }
       PrevState = CurrState;
     }
-    void onRParen() {
-      PrevState = State;
+    bool onRParen(StringRef &ErrMsg) {
+      IntelExprState CurrState = State;
       switch (State) {
       default:
         State = IES_ERROR;
@@ -1054,9 +1054,27 @@ class X86AsmParser : public MCTargetAsmParser {
       case IES_RBRAC:
       case IES_RPAREN:
         State = IES_RPAREN;
+        // In the case of a multiply, onRegister has already set IndexReg
+        // directly, with appropriate scale.
+        // Otherwise if we just saw a register it has only been stored in
+        // TmpReg, so we need to store it into the state machine.
+        if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
+          // If we already have a BaseReg, then assume this is the IndexReg with
+          // no explicit scale.
+          if (!BaseReg) {
+            BaseReg = TmpReg;
+          } else {
+            if (IndexReg)
+              return regsUseUpError(ErrMsg);
+            IndexReg = TmpReg;
+            Scale = 0;
+          }
+        }
         IC.pushOperator(IC_RPAREN);
         break;
       }
+      PrevState = CurrState;
+      return false;
     }
     bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
                   const InlineAsmIdentifierInfo &IDInfo,
@@ -2172,7 +2190,11 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       }
       break;
     case AsmToken::LParen:  SM.onLParen(); break;
-    case AsmToken::RParen:  SM.onRParen(); break;
+    case AsmToken::RParen:
+      if (SM.onRParen(ErrMsg)) {
+        return Error(Tok.getLoc(), ErrMsg);
+      }
+      break;
     }
     if (SM.hadError())
       return Error(Tok.getLoc(), "unknown token in expression");
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 0b4c63f7a81f..5d5a70589324 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -374,5 +374,36 @@ static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return true;
 }
 
+/// Special handling for i128 and fp128: on x86-32, i128 and fp128 get legalized
+/// as four i32s, but fp128 must be passed on the stack with 16-byte alignment.
+/// Technically only fp128 has a specified ABI, but it makes sense to handle
+/// i128 the same until we hear differently.
+static bool CC_X86_32_I128_FP128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                 CCValAssign::LocInfo &LocInfo,
+                                 ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i32 && "Should have i32 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  assert(PendingMembers.size() == 4 && "Should have four parts");
+
+  int64_t Offset = State.AllocateStack(16, Align(16));
+  PendingMembers[0].convertToMem(Offset);
+  PendingMembers[1].convertToMem(Offset + 4);
+  PendingMembers[2].convertToMem(Offset + 8);
+  PendingMembers[3].convertToMem(Offset + 12);
+
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  State.addLoc(PendingMembers[2]);
+  State.addLoc(PendingMembers[3]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
 #include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 823e0caa0226..f020e0b55141 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -859,6 +859,11 @@ def CC_X86_32_C : CallingConv<[
   // The 'nest' parameter, if any, is passed in ECX.
   CCIfNest<CCAssignToReg<[ECX]>>,
 
+  // i128 and fp128 need to be passed on the stack with a higher alignment than
+  // their legal types. Handle this with a custom function.
+  CCIfType<[i32],
+           CCIfConsecutiveRegs<CCCustom<"CC_X86_32_I128_FP128">>>,
+
   // On swifttailcc pass swiftself in ECX.
   CCIfCC<"CallingConv::SwiftTail",
          CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[ECX]>>>>,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e35d5630d66..86877be48eca 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4997,9 +4997,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
 
   EVT VT = Op.getValueType();
   unsigned SizeInBits = VT.getSizeInBits();
-  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
   unsigned NumElts = SizeInBits / EltSizeInBits;
 
+  // Can't split constant.
+  if ((SizeInBits % EltSizeInBits) != 0)
+    return false;
+
   // Bitcast a source array of element bits to the target size.
   auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
     unsigned NumSrcElts = UndefSrcElts.getBitWidth();
@@ -15400,18 +15403,18 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
         return SDValue();
     }
 
-    // Avoid returning the same shuffle operation. For example,
-    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
-    //                             undef:v16i16
-    if (CrossLaneMask == Mask || InLaneMask == Mask)
-      return SDValue();
-
     // Simplify CrossLaneMask based on the actual demanded elements.
     if (V1.hasOneUse())
       for (int i = 0; i != NumElts; ++i)
         if (!DemandedCrossLane[i])
           CrossLaneMask[i] = SM_SentinelUndef;
 
+    // Avoid returning the same shuffle operation. For example,
+    // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5,
+    //                             undef:v16i16
+    if (CrossLaneMask == Mask || InLaneMask == Mask)
+      return SDValue();
+
     SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask);
     return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT),
                                 InLaneMask);
@@ -26233,10 +26236,9 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-
-  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
-    if (MaskConst->getZExtValue() & 0x1)
-      return Op;
+  auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskConst && (MaskConst->getZExtValue() & 0x1))
+    return Op;
 
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -26252,6 +26254,17 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
 
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
+
+  if (MaskConst) {
+    assert((MaskConst->getZExtValue() & 0x1) == 0 && "Expected false mask");
+    // Discard op and blend passthrough with scalar op src/dst.
+    SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
+    std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+    ShuffleMask[0] = VT.getVectorNumElements();
+    return DAG.getVectorShuffle(VT, dl, Op.getOperand(0), PreservedSrc,
+                                ShuffleMask);
+  }
+
   return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
 }
 
@@ -44175,8 +44188,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
       // Conversions.
       // TODO: Add more CVT opcodes when we have test coverage.
-    case X86ISD::CVTTP2SI:
     case X86ISD::CVTTP2UI: {
+      if (!Subtarget.hasVLX())
+        break;
+      [[fallthrough]];
+    }
+    case X86ISD::CVTTP2SI: {
       if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f16 &&
           !Subtarget.hasVLX())
         break;
@@ -51758,6 +51775,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
     SDValue X, Y;
     EVT CondVT = VT.changeVectorElementType(MVT::i1);
     if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(CondVT) &&
+        (VT.is512BitVector() || Subtarget.hasVLX()) &&
+        (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
         sd_match(N, m_And(m_Value(X),
                           m_OneUse(m_SExt(m_AllOf(
                               m_Value(Y), m_SpecificVT(CondVT),
@@ -55312,6 +55331,8 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     SDValue Src = N0.getOperand(0);
     EVT SrcVT = Src.getValueType();
     if (Src.getOpcode() == ISD::SETCC && SrcVT.getScalarType() == MVT::i1 &&
+        (VT.is512BitVector() || Subtarget.hasVLX()) &&
+        (VT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
         TLI.isTypeLegal(SrcVT) && N0.hasOneUse() && Src.hasOneUse())
       return DAG.getSelect(DL, VT, DAG.getNOT(DL, Src, SrcVT), N1,
                            getZeroVector(VT, Subtarget, DAG, DL));
diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 9ad355311527..b4639ac2577e 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -237,9 +237,18 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
 bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg,
     const DataLayout &DL) const {
-  // i128 split into i64 needs to be allocated to two consecutive registers,
-  // or spilled to the stack as a whole.
-  return Ty->isIntegerTy(128);
+  // On x86-64 i128 is split into two i64s and needs to be allocated to two
+  // consecutive registers, or spilled to the stack as a whole. On x86-32 i128
+  // is split to four i32s and never actually passed in registers, but we use
+  // the consecutive register mark to match it in TableGen.
+  if (Ty->isIntegerTy(128))
+    return true;
+
+  // On x86-32, fp128 acts the same as i128.
+  if (Subtarget.is32Bit() && Ty->isFP128Ty())
+    return true;
+
+  return false;
 }
 
 /// Helper for getByValTypeAlignment to determine
diff --git a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
index fd42fd2e010b..f847ddb46af7 100644
--- a/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
+++ b/llvm/lib/Target/Xtensa/XtensaISelLowering.cpp
@@ -232,12 +232,6 @@ XtensaTargetLowering::XtensaTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
-
-    setCondCodeAction(ISD::SETOGT, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
-    setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
   } else {
     setOperationAction(ISD::BITCAST, MVT::i32, Expand);
     setOperationAction(ISD::BITCAST, MVT::f32, Expand);
@@ -877,6 +871,16 @@ static std::pair<unsigned, unsigned> getFPBranchKind(ISD::CondCode Cond) {
     return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
   case ISD::SETGT:
     return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+  case ISD::SETOGT:
+    return std::make_pair(Xtensa::BF, Xtensa::ULE_S);
+  case ISD::SETOGE:
+    return std::make_pair(Xtensa::BF, Xtensa::ULT_S);
+  case ISD::SETONE:
+    return std::make_pair(Xtensa::BF, Xtensa::UEQ_S);
+  case ISD::SETUGT:
+    return std::make_pair(Xtensa::BF, Xtensa::OLE_S);
+  case ISD::SETUGE:
+    return std::make_pair(Xtensa::BF, Xtensa::OLT_S);
   default:
     llvm_unreachable("Invalid condition!");
   }
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index 0584c941d2e6..a97cfe7d08d2 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -271,6 +271,8 @@ StringRef Triple::getVendorTypeName(VendorType Kind) {
   case PC: return "pc";
   case SCEI: return "scei";
   case SUSE: return "suse";
+  case Amazon:
+    return "amazon";
   }
 
   llvm_unreachable("Invalid VendorType!");
@@ -669,6 +671,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
       .Case("suse", Triple::SUSE)
       .Case("oe", Triple::OpenEmbedded)
       .Case("intel", Triple::Intel)
+      .Case("amazon", Triple::Amazon)
       .Default(Triple::UnknownVendor);
 }
 
@@ -1301,6 +1304,13 @@ std::string Triple::normalize(StringRef Str, CanonicalForm Form) {
   if (Vendor == Triple::SUSE && Environment == llvm::Triple::GNUEABI)
     Components[3] = "gnueabihf";
 
+  // Amazon Linux uses a "gnu" environment by default.
+  if (Environment == Triple::UnknownEnvironment && Vendor == Triple::Amazon &&
+      OS == Triple::Linux) {
+    Components.resize(4);
+    Components[3] = "gnu";
+  }
+
   if (OS == Triple::Win32) {
     Components.resize(4);
     Components[2] = "windows";
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 59ae057cae79..ac93f748ce65 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -85,6 +85,9 @@ static Intrinsic::ID NonOverloadedCoroIntrinsics[] = {
     Intrinsic::coro_id_async,
     Intrinsic::coro_id_retcon,
     Intrinsic::coro_id_retcon_once,
+    Intrinsic::coro_noop,
+    Intrinsic::coro_prepare_async,
+    Intrinsic::coro_prepare_retcon,
     Intrinsic::coro_promise,
     Intrinsic::coro_resume,
     Intrinsic::coro_save,
diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index e276376f2158..0d631734ca96 100644
--- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -350,12 +350,20 @@ void splitAndWriteThinLTOBitcode(
       });
     }
 
+  auto MustEmitToMergedModule = [](const GlobalValue *GV) {
+    // The __cfi_check definition is filled in by the CrossDSOCFI pass which
+    // runs only in the merged module.
+    return GV->getName() == "__cfi_check";
+  };
+
   ValueToValueMapTy VMap;
   std::unique_ptr<Module> MergedM(
       CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
         if (const auto *C = GV->getComdat())
           if (MergedMComdats.count(C))
             return true;
+        if (MustEmitToMergedModule(GV))
+          return true;
         if (auto *F = dyn_cast<Function>(GV))
           return EligibleVirtualFns.count(F);
         if (auto *GVar =
@@ -372,7 +380,7 @@ void splitAndWriteThinLTOBitcode(
   cloneUsedGlobalVariables(M, *MergedM, /*CompilerUsed*/ true);
 
   for (Function &F : *MergedM)
-    if (!F.isDeclaration()) {
+    if (!F.isDeclaration() && !MustEmitToMergedModule(&F)) {
       // Reset the linkage of all functions eligible for virtual constant
       // propagation. The canonical definitions live in the thin LTO module so
       // that they can be imported.
@@ -394,6 +402,8 @@ void splitAndWriteThinLTOBitcode(
     if (const auto *C = GV->getComdat())
       if (MergedMComdats.count(C))
         return false;
+    if (MustEmitToMergedModule(GV))
+      return false;
     return true;
   });
 
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 85dd9a1bf716..0f63ed0166cf 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2079,6 +2079,7 @@ struct DSEState {
       AllocFnKind AllocKind =
           Attrs.getFnAttr(Attribute::AllocKind).getAllocKind() |
           AllocFnKind::Zeroed;
+      AllocKind &= ~AllocFnKind::Uninitialized;
       Attrs =
           Attrs.addFnAttribute(Ctx, Attribute::getWithAllocKind(Ctx, AllocKind))
               .removeFnAttribute(Ctx, "alloc-variant-zeroed");
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index 221094f170ac..b9546c5fa236 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -128,6 +128,8 @@ class ConstantTerminatorFoldingImpl {
   // from any other block. So this variable set to true means that loop's latch
   // has become unreachable from loop header.
   bool DeleteCurrentLoop = false;
+  // Whether or not we enter the loop through an indirectbr.
+  bool HasIndirectEntry = false;
 
   // The blocks of the original loop that will still be reachable from entry
   // after the constant folding.
@@ -216,6 +218,19 @@ class ConstantTerminatorFoldingImpl {
       return;
     }
 
+    // We need a loop preheader to split in handleDeadExits(). If LoopSimplify
+    // wasn't able to form one because the loop can be entered through an
+    // indirectbr we cannot continue.
+    if (!L.getLoopPreheader()) {
+      assert(any_of(predecessors(L.getHeader()),
+                    [&](BasicBlock *Pred) {
+                      return isa<IndirectBrInst>(Pred->getTerminator());
+                    }) &&
+             "Loop should have preheader if it is not entered indirectly");
+      HasIndirectEntry = true;
+      return;
+    }
+
     // Collect live and dead loop blocks and exits.
     LiveLoopBlocks.insert(L.getHeader());
     for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
@@ -546,6 +561,12 @@ class ConstantTerminatorFoldingImpl {
       return false;
     }
 
+    if (HasIndirectEntry) {
+      LLVM_DEBUG(dbgs() << "Loops which can be entered indirectly are not"
+                           " supported!\n");
+      return false;
+    }
+
     // Nothing to constant-fold.
     if (FoldCandidates.empty()) {
       LLVM_DEBUG(
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 70b4552190a4..f537b0d3fbd7 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1258,8 +1258,7 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
            "Map index doesn't point back to a slice with this user.");
   }
 
-  // Disable SRoA for any intrinsics except for lifetime invariants and
-  // invariant group.
+  // Disable SRoA for any intrinsics except for lifetime invariants.
   // FIXME: What about debug intrinsics? This matches old behavior, but
   // doesn't make sense.
   void visitIntrinsicInst(IntrinsicInst &II) {
@@ -1279,12 +1278,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
       return;
     }
 
-    if (II.isLaunderOrStripInvariantGroup()) {
-      insertUse(II, Offset, AllocSize, true);
-      enqueueUsers(II);
-      return;
-    }
-
     Base::visitIntrinsicInst(II);
   }
 
@@ -3618,8 +3611,7 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
   }
 
   bool visitIntrinsicInst(IntrinsicInst &II) {
-    assert((II.isLifetimeStartOrEnd() || II.isLaunderOrStripInvariantGroup() ||
-            II.isDroppable()) &&
+    assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
            "Unexpected intrinsic!");
     LLVM_DEBUG(dbgs() << "    original: " << II << "\n");
 
@@ -3633,9 +3625,6 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
       return true;
     }
 
-    if (II.isLaunderOrStripInvariantGroup())
-      return true;
-
     assert(II.getArgOperand(1) == OldPtr);
     // Lifetime intrinsics are only promotable if they cover the whole alloca.
     // Therefore, we drop lifetime intrinsics which don't cover the whole
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index ddd203f3acf7..42b1fdf17f38 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -111,15 +111,14 @@ BasicBlock *
 llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum,
                              const CriticalEdgeSplittingOptions &Options,
                              const Twine &BBName) {
-  assert(!isa<IndirectBrInst>(TI) &&
-         "Cannot split critical edge from IndirectBrInst");
-
   BasicBlock *TIBB = TI->getParent();
   BasicBlock *DestBB = TI->getSuccessor(SuccNum);
 
-  // Splitting the critical edge to a pad block is non-trivial. Don't do
-  // it in this generic function.
-  if (DestBB->isEHPad()) return nullptr;
+  // Splitting the critical edge to a pad block is non-trivial.
+  // And we cannot split block with IndirectBr as a terminator.
+  // Don't do it in this generic function.
+  if (DestBB->isEHPad() || isa<IndirectBrInst>(TI))
+    return nullptr;
 
   if (Options.IgnoreUnreachableDests &&
       isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 200d1fb85415..e7623aaff105 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -938,8 +938,10 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   case RecurKind::UMin:
     return Intrinsic::vector_reduce_umin;
   case RecurKind::FMax:
+  case RecurKind::FMaxNum:
     return Intrinsic::vector_reduce_fmax;
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
     return Intrinsic::vector_reduce_fmin;
   case RecurKind::FMaximum:
     return Intrinsic::vector_reduce_fmaximum;
@@ -1037,8 +1039,10 @@ Intrinsic::ID llvm::getMinMaxReductionIntrinsicOp(RecurKind RK) {
   case RecurKind::SMax:
     return Intrinsic::smax;
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
     return Intrinsic::minnum;
   case RecurKind::FMax:
+  case RecurKind::FMaxNum:
     return Intrinsic::maxnum;
   case RecurKind::FMinimum:
     return Intrinsic::minimum;
@@ -1096,9 +1100,9 @@ Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
                             Value *Right) {
   Type *Ty = Left->getType();
   if (Ty->isIntOrIntVectorTy() ||
-      (RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
+      (RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum ||
+       RK == RecurKind::FMinimum || RK == RecurKind::FMaximum ||
        RK == RecurKind::FMinimumNum || RK == RecurKind::FMaximumNum)) {
-    // TODO: Add float minnum/maxnum support when FMF nnan is set.
     Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RK);
     return Builder.CreateIntrinsic(Ty, Id, {Left, Right}, nullptr,
                                    "rdx.minmax");
@@ -1308,6 +1312,8 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
   case RecurKind::UMin:
   case RecurKind::FMax:
   case RecurKind::FMin:
+  case RecurKind::FMinNum:
+  case RecurKind::FMaxNum:
   case RecurKind::FMinimum:
   case RecurKind::FMaximum:
   case RecurKind::FMinimumNum:
diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 24fe08d6c3e4..bf457194bfd8 100644
--- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -2133,8 +2133,15 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
   // negative. If Step is known to be positive or negative, only create
   // either 1. or 2.
   auto ComputeEndCheck = [&]() -> Value * {
-    // Checking <u 0 is always false.
-    if (!Signed && Start->isZero() && SE.isKnownPositive(Step))
+    // Checking <u 0 is always false, if (Step * trunc ExitCount) does not wrap.
+    // TODO: Predicates that can be proven true/false should be discarded when
+    // the predicates are created, not late during expansion.
+    if (!Signed && Start->isZero() && SE.isKnownPositive(Step) &&
+        DstBits < SrcBits &&
+        ExitCount == SE.getZeroExtendExpr(SE.getTruncateExpr(ExitCount, ARTy),
+                                          ExitCount->getType()) &&
+        SE.willNotOverflow(Instruction::Mul, Signed, Step,
+                           SE.getTruncateExpr(ExitCount, ARTy)))
       return ConstantInt::getFalse(Loc->getContext());
 
     // Get the backedge taken count and truncate or extended to the AR type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 11853859484e..f57ce0c3ccb4 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -230,7 +230,6 @@ class VPBuilder {
 
   /// Create a new ICmp VPInstruction with predicate \p Pred and operands \p A
   /// and \p B.
-  /// TODO: add createFCmp when needed.
   VPInstruction *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
                             DebugLoc DL = DebugLoc::getUnknown(),
                             const Twine &Name = "") {
@@ -240,6 +239,17 @@ class VPBuilder {
         new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name));
   }
 
+  /// Create a new FCmp VPInstruction with predicate \p Pred and operands \p A
+  /// and \p B.
+  VPInstruction *createFCmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B,
+                            DebugLoc DL = DebugLoc::getUnknown(),
+                            const Twine &Name = "") {
+    assert(Pred >= CmpInst::FIRST_FCMP_PREDICATE &&
+           Pred <= CmpInst::LAST_FCMP_PREDICATE && "invalid predicate");
+    return tryInsertInstruction(
+        new VPInstruction(Instruction::FCmp, {A, B}, Pred, DL, Name));
+  }
+
   VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset,
                               DebugLoc DL = DebugLoc::getUnknown(),
                               const Twine &Name = "") {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 06db89a89bc3..5cf4b1651538 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4345,10 +4345,14 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
     ElementCount VF) const {
-  // Cross iteration phis such as reductions need special handling and are
-  // currently unsupported.
-  if (any_of(OrigLoop->getHeader()->phis(),
-             [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))
+  // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
+  // reductions need special handling and are currently unsupported.
+  if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
+        if (!Legal->isReductionVariable(&Phi))
+          return Legal->isFixedOrderRecurrence(&Phi);
+        RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
+        return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+      }))
     return false;
 
   // Phis with uses outside of the loop require special handling and are
@@ -8817,6 +8821,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
 
+  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
+  // NaNs if possible, bail out otherwise.
+  if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
+                                *Plan))
+    return nullptr;
+
   // Transform recipes to abstract recipes if it is legal and beneficial and
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 31aec77db63c..2d50981690f4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9968,7 +9968,10 @@ BoUpSLP::getScalarsVectorizationLegality(ArrayRef<Value *> VL, unsigned Depth,
       }
       SmallPtrSet<Value *, 8> Values(llvm::from_range, E->Scalars);
       if (all_of(VL, [&](Value *V) {
-            return isa<PoisonValue>(V) || Values.contains(V);
+            return isa<PoisonValue>(V) || Values.contains(V) ||
+                   (S.getOpcode() == Instruction::PHI && isa<PHINode>(V) &&
+                    LI->getLoopFor(S.getMainOp()->getParent()) &&
+                    isVectorized(V));
           })) {
         LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
         return ScalarsVectorizationLegality(S, /*IsLegal=*/false);
@@ -23196,6 +23199,8 @@ class HorizontalReduction {
         case RecurKind::FindFirstIVUMin:
         case RecurKind::FindLastIVSMax:
         case RecurKind::FindLastIVUMax:
+        case RecurKind::FMaxNum:
+        case RecurKind::FMinNum:
         case RecurKind::FMaximumNum:
         case RecurKind::FMinimumNum:
         case RecurKind::None:
@@ -23333,6 +23338,8 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::FMaxNum:
+    case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
@@ -23435,6 +23442,8 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::FMaxNum:
+    case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
     case RecurKind::FMinimumNum:
     case RecurKind::None:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index b27a7ffeed20..66657b98b094 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -84,6 +84,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
     return ResTy;
   }
   case Instruction::ICmp:
+  case Instruction::FCmp:
   case VPInstruction::ActiveLaneMask:
     assert(inferScalarType(R->getOperand(0)) ==
                inferScalarType(R->getOperand(1)) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 52eecb000d0c..a7a22e042aef 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -628,3 +628,163 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
     Term->addMetadata(LLVMContext::MD_prof, BranchWeights);
   }
 }
+
+bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
+  auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
+    auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
+        RedPhiR->getBackedgeValue()->getDefiningRecipe());
+    if (!MinMaxR)
+      return nullptr;
+
+    auto *RepR = dyn_cast<VPReplicateRecipe>(MinMaxR);
+    if (!isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+        !(RepR && isa<IntrinsicInst>(RepR->getUnderlyingInstr())))
+      return nullptr;
+
+#ifndef NDEBUG
+    Intrinsic::ID RdxIntrinsicId =
+        RedPhiR->getRecurrenceKind() == RecurKind::FMaxNum ? Intrinsic::maxnum
+                                                           : Intrinsic::minnum;
+    assert((isa<VPWidenIntrinsicRecipe>(MinMaxR) &&
+            cast<VPWidenIntrinsicRecipe>(MinMaxR)->getVectorIntrinsicID() ==
+                RdxIntrinsicId) ||
+           (RepR &&
+            cast<IntrinsicInst>(RepR->getUnderlyingInstr())->getIntrinsicID() ==
+                RdxIntrinsicId) &&
+               "Intrinsic did not match recurrence kind");
+#endif
+
+    if (MinMaxR->getOperand(0) == RedPhiR)
+      return MinMaxR->getOperand(1);
+
+    assert(MinMaxR->getOperand(1) == RedPhiR &&
+           "Reduction phi operand expected");
+    return MinMaxR->getOperand(0);
+  };
+
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPReductionPHIRecipe *RedPhiR = nullptr;
+  bool HasUnsupportedPhi = false;
+  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
+    if (isa<VPCanonicalIVPHIRecipe, VPWidenIntOrFpInductionRecipe>(&R))
+      continue;
+    auto *Cur = dyn_cast<VPReductionPHIRecipe>(&R);
+    if (!Cur) {
+      // TODO: Also support fixed-order recurrence phis.
+      HasUnsupportedPhi = true;
+      continue;
+    }
+    // For now, only a single reduction is supported.
+    // TODO: Support multiple MaxNum/MinNum reductions and other reductions.
+    if (RedPhiR)
+      return false;
+    if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
+        Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+      HasUnsupportedPhi = true;
+      continue;
+    }
+    RedPhiR = Cur;
+  }
+
+  if (!RedPhiR)
+    return true;
+
+  // We won't be able to resume execution in the scalar tail, if there are
+  // unsupported header phis or there is no scalar tail at all, due to
+  // tail-folding.
+  if (HasUnsupportedPhi || !Plan.hasScalarTail())
+    return false;
+
+  VPValue *MinMaxOp = GetMinMaxCompareValue(RedPhiR);
+  if (!MinMaxOp)
+    return false;
+
+  RecurKind RedPhiRK = RedPhiR->getRecurrenceKind();
+  assert((RedPhiRK == RecurKind::FMaxNum || RedPhiRK == RecurKind::FMinNum) &&
+         "unsupported reduction");
+
+  /// Check if the vector loop of \p Plan can early exit and restart
+  /// execution of last vector iteration in the scalar loop. This requires all
+  /// recipes up to early exit point be side-effect free as they are
+  /// re-executed. Currently we check that the loop is free of any recipe that
+  /// may write to memory. Expected to operate on an early VPlan w/o nested
+  /// regions.
+  for (VPBlockBase *VPB : vp_depth_first_shallow(
+           Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+    auto *VPBB = cast<VPBasicBlock>(VPB);
+    for (auto &R : *VPBB) {
+      if (R.mayWriteToMemory() &&
+          !match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())))
+        return false;
+    }
+  }
+
+  VPBasicBlock *LatchVPBB = LoopRegion->getExitingBasicBlock();
+  VPBuilder Builder(LatchVPBB->getTerminator());
+  auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
+  assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
+         "Unexpected terminator");
+  auto *IsLatchExitTaken =
+      Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
+                         LatchExitingBranch->getOperand(1));
+
+  VPValue *IsNaN = Builder.createFCmp(CmpInst::FCMP_UNO, MinMaxOp, MinMaxOp);
+  VPValue *AnyNaN = Builder.createNaryOp(VPInstruction::AnyOf, {IsNaN});
+  auto *AnyExitTaken =
+      Builder.createNaryOp(Instruction::Or, {AnyNaN, IsLatchExitTaken});
+  Builder.createNaryOp(VPInstruction::BranchOnCond, AnyExitTaken);
+  LatchExitingBranch->eraseFromParent();
+
+  // If we exit early due to NaNs, compute the final reduction result based on
+  // the reduction phi at the beginning of the last vector iteration.
+  auto *RdxResult = find_singleton<VPSingleDefRecipe>(
+      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+          return VPI;
+        return nullptr;
+      });
+
+  auto *MiddleVPBB = Plan.getMiddleBlock();
+  Builder.setInsertPoint(MiddleVPBB, MiddleVPBB->begin());
+  auto *NewSel =
+      Builder.createSelect(AnyNaN, RedPhiR, RdxResult->getOperand(1));
+  RdxResult->setOperand(1, NewSel);
+
+  auto *ScalarPH = Plan.getScalarPreheader();
+  // Update resume phis for inductions in the scalar preheader. If AnyNaN is
+  // true, the resume from the start of the last vector iteration via the
+  // canonical IV, otherwise from the original value.
+  for (auto &R : ScalarPH->phis()) {
+    auto *ResumeR = cast<VPPhi>(&R);
+    VPValue *VecV = ResumeR->getOperand(0);
+    if (VecV == RdxResult)
+      continue;
+    if (auto *DerivedIV = dyn_cast<VPDerivedIVRecipe>(VecV)) {
+      if (DerivedIV->getNumUsers() == 1 &&
+          DerivedIV->getOperand(1) == &Plan.getVectorTripCount()) {
+        auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(),
+                                            &Plan.getVectorTripCount());
+        DerivedIV->moveAfter(&*Builder.getInsertPoint());
+        DerivedIV->setOperand(1, NewSel);
+        continue;
+      }
+    }
+    // Bail out and abandon the current, partially modified, VPlan if we
+    // encounter resume phi that cannot be updated yet.
+    if (VecV != &Plan.getVectorTripCount()) {
+      LLVM_DEBUG(dbgs() << "Found resume phi we cannot update for VPlan with "
+                           "FMaxNum/FMinNum reduction.\n");
+      return false;
+    }
+    auto *NewSel = Builder.createSelect(AnyNaN, Plan.getCanonicalIV(), VecV);
+    ResumeR->setOperand(0, NewSel);
+  }
+
+  auto *MiddleTerm = MiddleVPBB->getTerminator();
+  Builder.setInsertPoint(MiddleTerm);
+  VPValue *MiddleCond = MiddleTerm->getOperand(0);
+  VPValue *NewCond = Builder.createAnd(MiddleCond, Builder.createNot(AnyNaN));
+  MiddleTerm->setOperand(0, NewCond);
+  return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1664bcc3881a..57b713d3dfcb 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -587,6 +587,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
     return Builder.CreateFreeze(Op, Name);
   }
+  case Instruction::FCmp:
   case Instruction::ICmp: {
     bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
     Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -860,7 +861,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
     Value *Res = State.get(getOperand(0));
     for (VPValue *Op : drop_begin(operands()))
       Res = Builder.CreateOr(Res, State.get(Op));
-    return Builder.CreateOrReduce(Res);
+    return State.VF.isScalar() ? Res : Builder.CreateOrReduce(Res);
   }
   case VPInstruction::FirstActiveLane: {
     if (getNumOperands() == 1) {
@@ -1033,6 +1034,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
   switch (getOpcode()) {
   case Instruction::ExtractElement:
   case Instruction::Freeze:
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
   case VPInstruction::AnyOf:
@@ -1068,6 +1070,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
     return Op == getOperand(1);
   case Instruction::PHI:
     return true;
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
   case Instruction::Or:
@@ -1100,6 +1103,7 @@ bool VPInstruction::onlyFirstPartUsed(const VPValue *Op) const {
   switch (getOpcode()) {
   default:
     return false;
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
     return vputils::onlyFirstPartUsed(this);
@@ -1786,7 +1790,7 @@ bool VPIRFlags::flagsValidForOpcode(unsigned Opcode) const {
     return Opcode == Instruction::ZExt;
     break;
   case OperationType::Cmp:
-    return Opcode == Instruction::ICmp;
+    return Opcode == Instruction::FCmp || Opcode == Instruction::ICmp;
   case OperationType::Other:
     return true;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 870b1bb68b79..c3fb359d1429 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -99,6 +99,12 @@ struct VPlanTransforms {
   /// not valid.
   static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
 
+  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
+  /// try to update the vector loop to exit early if any input is NaN and resume
+  /// executing in the scalar loop to handle the NaNs there. Return false if
+  /// this attempt was unsuccessful.
+  static bool handleMaxMinNumReductions(VPlan &Plan);
+
   /// Clear NSW/NUW flags from reduction instructions if necessary.
   static void clearReductionWrapFlags(VPlan &Plan);
 
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index fe8d74c43dfd..639f8686a271 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -74,7 +74,7 @@ class VectorCombine {
                 const DataLayout *DL, TTI::TargetCostKind CostKind,
                 bool TryEarlyFoldsOnly)
       : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
-        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind),
+        DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
         TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
 
   bool run();
@@ -88,6 +88,7 @@ class VectorCombine {
   AssumptionCache &AC;
   const DataLayout *DL;
   TTI::TargetCostKind CostKind;
+  const SimplifyQuery SQ;
 
   /// If true, only perform beneficial early IR transforms. Do not introduce new
   /// vector operations.
@@ -1185,17 +1186,18 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   // Fold the vector constants in the original vectors into a new base vector to
   // get more accurate cost modelling.
   Value *NewVecC = nullptr;
-  TargetFolder Folder(*DL);
   if (CI)
-    NewVecC = Folder.FoldCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
+    NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
   else if (UO)
     NewVecC =
-        Folder.FoldUnOpFMF(UO->getOpcode(), VecCs[0], UO->getFastMathFlags());
+        simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
   else if (BO)
-    NewVecC = Folder.FoldBinOp(BO->getOpcode(), VecCs[0], VecCs[1]);
-  else if (II->arg_size() == 2)
-    NewVecC = Folder.FoldBinaryIntrinsic(II->getIntrinsicID(), VecCs[0],
-                                         VecCs[1], II->getType(), &I);
+    NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
+  else if (II)
+    NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
+
+  if (!NewVecC)
+    return false;
 
   // Get cost estimate for the insert element. This cost will factor into
   // both sequences.
@@ -1203,6 +1205,7 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   InstructionCost NewCost =
       ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                             CostKind, *Index, NewVecC);
+
   for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
     if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
                               II->getIntrinsicID(), Idx, &TTI)))
@@ -1247,15 +1250,6 @@ bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
   if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
     ScalarInst->copyIRFlags(&I);
 
-  // Create a new base vector if the constant folding failed.
-  if (!NewVecC) {
-    if (CI)
-      NewVecC = Builder.CreateCmp(CI->getPredicate(), VecCs[0], VecCs[1]);
-    else if (UO || BO)
-      NewVecC = Builder.CreateNAryOp(Opcode, VecCs);
-    else
-      NewVecC = Builder.CreateIntrinsic(VecTy, II->getIntrinsicID(), VecCs);
-  }
   Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
   replaceValue(I, *Insert);
   return true;
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
index b484f8f6c60b..21e0356fd732 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
index 519b8ecf6dc7..27dd42297bfa 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
index 2a8609d2f418..826605450a2d 100644
--- a/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/reduce-xor.ll
@@ -13,8 +13,8 @@ define void @reduce() {
 ; CHECK-NEXT:  Cost Model: Found costs of 3 for: %V32 = call i1 @llvm.vector.reduce.xor.v32i1(<32 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %V64 = call i1 @llvm.vector.reduce.xor.v64i1(<64 x i1> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 9 for: %V128 = call i1 @llvm.vector.reduce.xor.v128i1(<128 x i1> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 0 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
-; CHECK-NEXT:  Cost Model: Found costs of 4 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %V1i8 = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> undef)
+; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:5 Lat:6 SizeLat:6 for: %V3i8 = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 7 for: %V4i8 = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 15 for: %V8i8 = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> undef)
 ; CHECK-NEXT:  Cost Model: Found costs of 17 for: %V16i8 = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
index 41c272291d7c..4579acb9b355 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-other.ll
@@ -93,36 +93,36 @@ define void @insert_subvec() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_1 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_2 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i8_2_3 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i8_2_05 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_0 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_2 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i8_4_3 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i8_4_05 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_0 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i16_2_1 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_0 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_1 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_2 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i16_2_3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i16_2_05 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_0 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_1 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_2 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i16_4_3 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i16_4_05 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_0 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4i32_2_1 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_0 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_1 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_2 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8i32_2_3 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:6 CodeSize:3 Lat:6 SizeLat:6 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v8i32_2_05 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 8, i32 9, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_0 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_1 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16i32_4_3 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:16 CodeSize:8 Lat:16 SizeLat:16 for: %v16i32_4_05 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 16, i32 17, i32 18, i32 19, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
   %v4i8_2_0 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -369,7 +369,7 @@ define void @multipart() {
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %v32idrev = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 31, i32 30, i32 29, i32 28>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
 ; CHECK-NEXT:  Cost Model: Found costs of 16 for: %v32many2 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> <i32 1, i32 4, i32 8, i32 12, i32 17, i32 20, i32 24, i32 28, i32 2, i32 6, i32 11, i32 14, i32 18, i32 22, i32 27, i32 30>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:8 CodeSize:4 Lat:8 SizeLat:8 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %v323 = shufflevector <3 x i32> undef, <3 x i32> undef, <3 x i32> <i32 2, i32 3, i32 0>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64a = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v64b = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v64ab = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
@@ -408,10 +408,10 @@ define void @vst3(ptr %p) {
 ; CHECK-LABEL: 'vst3'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
 ; CHECK-NEXT:  Cost Model: Found costs of 24 for: %v32i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
 ; CHECK-NEXT:  Cost Model: Found costs of 48 for: %v64i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <48 x i32> <i32 0, i32 16, i32 32, i32 1, i32 17, i32 33, i32 2, i32 18, i32 34, i32 3, i32 19, i32 35, i32 4, i32 20, i32 36, i32 5, i32 21, i32 37, i32 6, i32 22, i32 38, i32 7, i32 23, i32 39, i32 8, i32 24, i32 40, i32 9, i32 25, i32 41, i32 10, i32 26, i32 42, i32 11, i32 27, i32 43, i32 12, i32 28, i32 44, i32 13, i32 29, i32 45, i32 14, i32 30, i32 46, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:  Cost Model: Found costs of 5 for: %v8i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5>
@@ -452,10 +452,10 @@ define void @vst4(ptr %p) {
 ; CHECK-LABEL: 'vst4'
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:120 CodeSize:60 Lat:120 SizeLat:120 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:128 CodeSize:64 Lat:128 SizeLat:128 for: %v32i8 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i8 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 8 for: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of 32 for: %v32i16 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of 64 for: %v64i16 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> <i32 0, i32 16, i32 32, i32 48, i32 1, i32 17, i32 33, i32 49, i32 2, i32 18, i32 34, i32 50, i32 3, i32 19, i32 35, i32 51, i32 4, i32 20, i32 36, i32 52, i32 5, i32 21, i32 37, i32 53, i32 6, i32 22, i32 38, i32 54, i32 7, i32 23, i32 39, i32 55, i32 8, i32 24, i32 40, i32 56, i32 9, i32 25, i32 41, i32 57, i32 10, i32 26, i32 42, i32 58, i32 11, i32 27, i32 43, i32 59, i32 12, i32 28, i32 44, i32 60, i32 13, i32 29, i32 45, i32 61, i32 14, i32 30, i32 46, i32 62, i32 15, i32 31, i32 47, i32 63>
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
index 09f116f01ec7..4a003a0085c2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-select.ll
@@ -5,7 +5,7 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v8i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i8> %tmp0
 ;
   %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
@@ -14,7 +14,7 @@ define <8 x i8> @sel_v8i8(<8 x i8> %v0, <8 x i8> %v1) {
 
 define <16 x i8> @sel_v16i8(<16 x i8> %v0, <16 x i8> %v1) {
 ; CHECK-LABEL: 'sel_v16i8'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:60 CodeSize:30 Lat:60 SizeLat:60 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <16 x i8> %tmp0
 ;
   %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -32,7 +32,7 @@ define <4 x i16> @sel_v4i16(<4 x i16> %v0, <4 x i16> %v1) {
 
 define <8 x i16> @sel_v8i16(<8 x i16> %v0, <8 x i16> %v1) {
 ; CHECK-LABEL: 'sel_v8i16'
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:32 CodeSize:16 Lat:32 SizeLat:32 for: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x i16> %tmp0
 ;
   %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
index ec84c58bf968..fa889cc12dc4 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-intrinsics.ll
@@ -7,15 +7,15 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 
 define void @vector_insert_extract(<vscale x 4 x i32> %v0, <vscale x 16 x i32> %v1, <16 x i32> %v2) {
 ; CHECK-VSCALE-1-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'vector_insert_extract'
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:54 CodeSize:27 Lat:54 SizeLat:54 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %extract_fixed_from_scalable = call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32(<vscale x 4 x i32> %v0, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:64 CodeSize:32 Lat:64 SizeLat:64 for: %insert_fixed_into_scalable = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> %v0, <16 x i32> %v2, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_scalable_from_scalable = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv16i32(<vscale x 16 x i32> %v1, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_scalable_into_scalable = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %v0, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
@@ -44,7 +44,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -56,7 +56,7 @@ define void @vector_insert_extract_idxzero_128b() #1 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -101,7 +101,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -113,7 +113,7 @@ define void @vector_insert_extract_idxzero_256b() #2 {
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 0 for: %extract_legal_fixed_from_scalable = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv16i1_nxv2i1 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv2i1(<vscale x 16 x i1> undef, <vscale x 2 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %extract_nxv4i1_nxv16i1 = call <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1> undef, i64 0)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:42 CodeSize:28 Lat:42 SizeLat:42 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:48 CodeSize:32 Lat:48 SizeLat:48 for: %extract_v8i1_nxv8i1 = call <8 x i1> @llvm.vector.extract.v8i1.nxv8i1(<vscale x 8 x i1> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:4 CodeSize:2 Lat:4 SizeLat:4 for: %insert_v2f32_nxv2f32 = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.v2f32(<vscale x 2 x float> undef, <2 x float> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:12 CodeSize:6 Lat:12 SizeLat:12 for: %extract_v4f16_nxv4f16 = call <4 x half> @llvm.vector.extract.v4f16.nxv4f16(<vscale x 4 x half> undef, i64 0)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 1 for: %insert_nxv2f32_nxv4f32 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> undef, <vscale x 2 x float> undef, i64 0)
@@ -1364,34 +1364,34 @@ define void @match() #3 {
 ; CHECK-VSCALE-1-LABEL: 'match'
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-1-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; CHECK-VSCALE-2-LABEL: 'match'
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; CHECK-VSCALE-2-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 ; TYPE_BASED_ONLY-LABEL: 'match'
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv16i8_v16i8 = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> undef, <vscale x 16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 4 for: %match_nxv8i16_v8i16 = call <vscale x 8 x i1> @llvm.experimental.vector.match.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> undef, <vscale x 8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_nxv4i32_v4i32 = call <vscale x 4 x i1> @llvm.experimental.vector.match.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> undef, <vscale x 4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_nxv2i64_v2i64 = call <vscale x 2 x i1> @llvm.experimental.vector.match.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> undef, <vscale x 2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v16i8_v16i8 = call <16 x i1> @llvm.experimental.vector.match.v16i8.v16i8(<16 x i8> undef, <16 x i8> undef, <16 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of 14 for: %match_v8i16_v8i16 = call <8 x i1> @llvm.experimental.vector.match.v8i16.v8i16(<8 x i16> undef, <8 x i16> undef, <8 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:21 CodeSize:17 Lat:21 SizeLat:21 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
-; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:11 CodeSize:9 Lat:11 SizeLat:11 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:29 CodeSize:21 Lat:29 SizeLat:29 for: %match_v4i32_v4i32 = call <4 x i1> @llvm.experimental.vector.match.v4i32.v4i32(<4 x i32> undef, <4 x i32> undef, <4 x i1> undef)
+; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:15 CodeSize:11 Lat:15 SizeLat:15 for: %match_v2i64_v2i64 = call <2 x i1> @llvm.experimental.vector.match.v2i64.v2i64(<2 x i64> undef, <2 x i64> undef, <2 x i1> undef)
 ; TYPE_BASED_ONLY-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
 ;
 
diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll
index bdd8540a2c47..e64bce2d9c9e 100644
--- a/llvm/test/Analysis/CostModel/RISCV/cast.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: opt < %s -mtriple=riscv32 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,ZVE64D
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,ZVE64D
+; RUN: opt < %s -mtriple=riscv64 -mattr=+zvl128b,+zve64x -passes="print<cost-model>" -cost-kind=throughput -intrinsic-cost-strategy=type-based-intrinsic-cost 2>&1 -disable-output | FileCheck %s --check-prefixes=CHECK,ZVE64X
 
 define void @sext() {
 ; CHECK-LABEL: 'sext'
@@ -2021,92 +2022,179 @@ define void @trunc() {
 }
 
 define void @fpext() {
-; CHECK-LABEL: 'fpext'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2f32 = fpext <2 x half> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f64 = fpext <2 x half> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f64 = fpext <2 x float> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4f32 = fpext <4 x half> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4f64 = fpext <4 x half> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4f64 = fpext <4 x float> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8f32 = fpext <8 x half> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f16_v8f64 = fpext <8 x half> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8f64 = fpext <8 x float> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16_v16f32 = fpext <16 x half> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f16_v16f64 = fpext <16 x half> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16f64 = fpext <16 x float> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f16_v32f32 = fpext <32 x half> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f16_v32f64 = fpext <32 x half> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32f64 = fpext <32 x float> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64f16_v64f32 = fpext <64 x half> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f16_v64f64 = fpext <64 x half> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64f64 = fpext <64 x float> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128f16_v128f32 = fpext <128 x half> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128f16_v128f64 = fpext <128 x half> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128f64 = fpext <128 x float> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1f32 = fpext <vscale x 1 x half> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1f64 = fpext <vscale x 1 x half> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f64 = fpext <vscale x 1 x float> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f16_nxv4f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f16_nxv8f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f16_nxv8f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f16_nxv16f32 = fpext <vscale x 16 x half> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f16_nxv16f64 = fpext <vscale x 16 x half> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16f64 = fpext <vscale x 16 x float> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32f16_nxv32f32 = fpext <vscale x 32 x half> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f16_nxv32f64 = fpext <vscale x 32 x half> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32f64 = fpext <vscale x 32 x float> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64f32 = fpext <vscale x 64 x half> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64f64 = fpext <vscale x 64 x half> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64f64 = fpext <vscale x 64 x float> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64f32.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'fpext'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f16_v2f32 = fpext <2 x half> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f64 = fpext <2 x half> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f64 = fpext <2 x float> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f16_v4f32 = fpext <4 x half> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f16_v4f64 = fpext <4 x half> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4f64 = fpext <4 x float> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f16_v8f32 = fpext <8 x half> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f16_v8f64 = fpext <8 x half> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8f64 = fpext <8 x float> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f16_v16f32 = fpext <16 x half> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f16_v16f64 = fpext <16 x half> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16f64 = fpext <16 x float> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f16_v32f32 = fpext <32 x half> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f16_v32f64 = fpext <32 x half> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32f64 = fpext <32 x float> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64f16_v64f32 = fpext <64 x half> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f16_v64f64 = fpext <64 x half> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64f64 = fpext <64 x float> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128f16_v128f32 = fpext <128 x half> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128f16_v128f64 = fpext <128 x half> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128f64 = fpext <128 x float> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f16_nxv1f32 = fpext <vscale x 1 x half> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f16_nxv1f64 = fpext <vscale x 1 x half> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f64 = fpext <vscale x 1 x float> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f16_nxv2f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f16_nxv2f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f16_nxv4f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f16_nxv4f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f16_nxv8f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f16_nxv8f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f16_nxv16f32 = fpext <vscale x 16 x half> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f16_nxv16f64 = fpext <vscale x 16 x half> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16f64 = fpext <vscale x 16 x float> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32f16_nxv32f32 = fpext <vscale x 32 x half> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f16_nxv32f64 = fpext <vscale x 32 x half> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32f64 = fpext <vscale x 32 x float> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64f16_nxv64f32 = fpext <vscale x 64 x half> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64f16_nxv64f64 = fpext <vscale x 64 x half> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64f64 = fpext <vscale x 64 x float> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64f32.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'fpext'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f32 = fpext <2 x half> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f16_v2f64 = fpext <2 x half> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2f64 = fpext <2 x float> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16_v4f32 = fpext <4 x half> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f16_v4f64 = fpext <4 x half> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32_v4f64 = fpext <4 x float> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16_v8f32 = fpext <8 x half> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f16_v8f64 = fpext <8 x half> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8f64 = fpext <8 x float> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16_v16f32 = fpext <16 x half> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f16_v16f64 = fpext <16 x half> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32_v16f64 = fpext <16 x float> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f16_v32f32 = fpext <32 x half> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f16_v32f64 = fpext <32 x half> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f32_v32f64 = fpext <32 x float> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f16_v64f32 = fpext <64 x half> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f16_v64f64 = fpext <64 x half> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f32_v64f64 = fpext <64 x float> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f16_v128f32 = fpext <128 x half> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f16_v128f64 = fpext <128 x half> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f32_v128f64 = fpext <128 x float> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16_nxv1f32 = fpext <vscale x 1 x half> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f16_nxv1f64 = fpext <vscale x 1 x half> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1f64 = fpext <vscale x 1 x float> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16_nxv2f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f16_nxv2f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16_nxv4f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f16_nxv4f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16_nxv8f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f16_nxv8f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16_nxv16f32 = fpext <vscale x 16 x half> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f16_nxv16f64 = fpext <vscale x 16 x half> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16f64 = fpext <vscale x 16 x float> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f16_nxv32f32 = fpext <vscale x 32 x half> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f16_nxv32f64 = fpext <vscale x 32 x half> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32f64 = fpext <vscale x 32 x float> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f16_nxv64f32 = fpext <vscale x 64 x half> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f16_nxv64f64 = fpext <vscale x 64 x half> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64f64 = fpext <vscale x 64 x float> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64f32.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f16_v2f32 = fpext <2 x half> undef to <2 x float>
   %v2f16_v2f64 = fpext <2 x half> undef to <2 x double>
@@ -2224,92 +2312,179 @@ define void @fpext() {
 }
 
 define void @fptrunc() {
-; CHECK-LABEL: 'fptrunc'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f16 = fptrunc <vscale x 1 x float> undef to <vscale x 1 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1f16 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1f32 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fptrunc.nxv1f32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv1f16 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv1f16 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv1f32 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4f16 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4f16 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4f32 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fptrunc.nxv4f32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8f16 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8f16 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8f32 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fptrunc.nxv8f32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16f16 = fptrunc <vscale x 16 x float> undef to <vscale x 16 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16f16 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16f32 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32f16 = fptrunc <vscale x 32 x float> undef to <vscale x 32 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32f16 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32f32 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32f32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64f16 = fptrunc <vscale x 64 x float> undef to <vscale x 64 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64f16 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x half>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64f32 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fptrunc.nxv64f32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'fptrunc'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1f16 = fptrunc <vscale x 1 x float> undef to <vscale x 1 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1f16 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1f32 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fptrunc.nxv1f32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv1f16 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv1f16 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv1f32 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4f16 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4f16 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4f32 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fptrunc.nxv4f32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8f16 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8f16 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8f32 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fptrunc.nxv8f32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16f16 = fptrunc <vscale x 16 x float> undef to <vscale x 16 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16f16 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16f32 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32f16 = fptrunc <vscale x 32 x float> undef to <vscale x 32 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32f16 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32f32 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32f32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64f16 = fptrunc <vscale x 64 x float> undef to <vscale x 64 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64f16 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x half>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64f32 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fptrunc.nxv64f32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'fptrunc'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 128 for instruction: %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1f16 = fptrunc <vscale x 1 x float> undef to <vscale x 1 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1f16 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1f32 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fptrunc.nxv1f32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv1f16 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv1f16 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv1f32 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4f16 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4f16 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4f32 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fptrunc.nxv4f32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8f16 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8f16 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8f32 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fptrunc.nxv8f32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16f16 = fptrunc <vscale x 16 x float> undef to <vscale x 16 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16f16 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16f32 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32f16 = fptrunc <vscale x 32 x float> undef to <vscale x 32 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32f16 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32f32 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32f32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64f16 = fptrunc <vscale x 64 x float> undef to <vscale x 64 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64f16 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x half>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64f32 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fptrunc.nxv64f32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2f16 = fptrunc <2 x float> undef to <2 x half>
   %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half>
@@ -2427,288 +2602,571 @@ define void @fptrunc() {
 }
 
 define void @fptosi() {
-; CHECK-LABEL: 'fptosi'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'fptosi'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'fptosi'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i16 = fptosi <2 x float> undef to <2 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i16 = fptosi <2 x double> undef to <2 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i32 = fptosi <2 x float> undef to <2 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i32 = fptosi <2 x double> undef to <2 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i64 = fptosi <2 x float> undef to <2 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i64 = fptosi <2 x double> undef to <2 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i16 = fptosi <4 x float> undef to <4 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i16 = fptosi <4 x double> undef to <4 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i32 = fptosi <4 x float> undef to <4 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i32 = fptosi <4 x double> undef to <4 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f32_v4i64 = fptosi <4 x float> undef to <4 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f64_v4i64 = fptosi <4 x double> undef to <4 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i16 = fptosi <8 x float> undef to <8 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i16 = fptosi <8 x double> undef to <8 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i32 = fptosi <8 x float> undef to <8 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i32 = fptosi <8 x double> undef to <8 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f32_v8i64 = fptosi <8 x float> undef to <8 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f64_v8i64 = fptosi <8 x double> undef to <8 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i16 = fptosi <16 x float> undef to <16 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i16 = fptosi <16 x double> undef to <16 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i32 = fptosi <16 x float> undef to <16 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i32 = fptosi <16 x double> undef to <16 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f32_v16i64 = fptosi <16 x float> undef to <16 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f64_v16i64 = fptosi <16 x double> undef to <16 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i8 = fptosi <32 x float> undef to <32 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i16 = fptosi <32 x float> undef to <32 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i16 = fptosi <32 x double> undef to <32 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i32 = fptosi <32 x float> undef to <32 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i32 = fptosi <32 x double> undef to <32 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32f32_v32i64 = fptosi <32 x float> undef to <32 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32f64_v32i64 = fptosi <32 x double> undef to <32 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f32_v64i16 = fptosi <64 x float> undef to <64 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f64_v64i16 = fptosi <64 x double> undef to <64 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v64f32_v64i32 = fptosi <64 x float> undef to <64 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v64f64_v64i32 = fptosi <64 x double> undef to <64 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64f32_v64i64 = fptosi <64 x float> undef to <64 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64f64_v64i64 = fptosi <64 x double> undef to <64 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %v128f32_v128i16 = fptosi <128 x float> undef to <128 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %v128f64_v128i16 = fptosi <128 x double> undef to <128 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %v128f32_v128i32 = fptosi <128 x float> undef to <128 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %v128f64_v128i32 = fptosi <128 x double> undef to <128 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128f32_v128i64 = fptosi <128 x float> undef to <128 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128f64_v128i64 = fptosi <128 x double> undef to <128 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i16 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i16 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i32 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i32 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i64 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i64 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i16 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i16 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i32 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i32 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i64 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i64 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i16 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i16 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i32 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i32 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i64 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i64 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i16 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i16 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i32 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i32 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i64 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i64 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i16 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i16 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i32 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i32 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i64 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i64 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i16 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i16 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i32 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i32 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i64 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i64 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i16 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i16 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i32 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i32 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i64 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i64 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptosi <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptosi <2 x double> undef to <2 x i8>
@@ -3022,288 +3480,571 @@ define void @fptosi() {
 }
 
 define void @fptoui() {
-; CHECK-LABEL: 'fptoui'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'fptoui'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 27 for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 54 for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'fptoui'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i16 = fptoui <2 x float> undef to <2 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i16 = fptoui <2 x double> undef to <2 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f32_v2i32 = fptoui <2 x float> undef to <2 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v2f64_v2i32 = fptoui <2 x double> undef to <2 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f32_v2i64 = fptoui <2 x float> undef to <2 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2f64_v2i64 = fptoui <2 x double> undef to <2 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i16 = fptoui <4 x float> undef to <4 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i16 = fptoui <4 x double> undef to <4 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f32_v4i32 = fptoui <4 x float> undef to <4 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v4f64_v4i32 = fptoui <4 x double> undef to <4 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f32_v4i64 = fptoui <4 x float> undef to <4 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4f64_v4i64 = fptoui <4 x double> undef to <4 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 39 for instruction: %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i16 = fptoui <8 x float> undef to <8 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i16 = fptoui <8 x double> undef to <8 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f32_v8i32 = fptoui <8 x float> undef to <8 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v8f64_v8i32 = fptoui <8 x double> undef to <8 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f32_v8i64 = fptoui <8 x float> undef to <8 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8f64_v8i64 = fptoui <8 x double> undef to <8 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 79 for instruction: %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i16 = fptoui <16 x float> undef to <16 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i16 = fptoui <16 x double> undef to <16 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f32_v16i32 = fptoui <16 x float> undef to <16 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v16f64_v16i32 = fptoui <16 x double> undef to <16 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f32_v16i64 = fptoui <16 x float> undef to <16 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16f64_v16i64 = fptoui <16 x double> undef to <16 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 159 for instruction: %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i16 = fptoui <32 x float> undef to <32 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i16 = fptoui <32 x double> undef to <32 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f32_v32i32 = fptoui <32 x float> undef to <32 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32f64_v32i32 = fptoui <32 x double> undef to <32 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32f32_v32i64 = fptoui <32 x float> undef to <32 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32f64_v32i64 = fptoui <32 x double> undef to <32 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 319 for instruction: %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f32_v64i16 = fptoui <64 x float> undef to <64 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v64f64_v64i16 = fptoui <64 x double> undef to <64 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v64f32_v64i32 = fptoui <64 x float> undef to <64 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v64f64_v64i32 = fptoui <64 x double> undef to <64 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64f32_v64i64 = fptoui <64 x float> undef to <64 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64f64_v64i64 = fptoui <64 x double> undef to <64 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 639 for instruction: %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %v128f32_v128i16 = fptoui <128 x float> undef to <128 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %v128f64_v128i16 = fptoui <128 x double> undef to <128 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %v128f32_v128i32 = fptoui <128 x float> undef to <128 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %v128f64_v128i32 = fptoui <128 x double> undef to <128 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128f32_v128i64 = fptoui <128 x float> undef to <128 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128f64_v128i64 = fptoui <128 x double> undef to <128 x i64>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 511 for instruction: %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 508 for instruction: %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1279 for instruction: %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i8 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i8 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i16 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i16 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i32 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i32 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i64 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i64 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f32_nxv1i1 = fptoui <vscale x 1 x float> undef to <vscale x 1 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1f64_nxv1i1 = fptoui <vscale x 1 x double> undef to <vscale x 1 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptoui.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptoui.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptoui.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptoui.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptoui.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i8 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i8 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i16 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i16 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i32 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i32 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i64 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i64 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f32_nxv2i1 = fptoui <vscale x 2 x float> undef to <vscale x 2 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2f64_nxv2i1 = fptoui <vscale x 2 x double> undef to <vscale x 2 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptoui.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptoui.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptoui.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptoui.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptoui.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i8 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i8 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i16 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i16 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i32 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i32 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i64 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i64 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f32_nxv4i1 = fptoui <vscale x 4 x float> undef to <vscale x 4 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4f64_nxv4i1 = fptoui <vscale x 4 x double> undef to <vscale x 4 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptoui.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptoui.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptoui.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptoui.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptoui.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i8 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i8 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i16 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i16 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i32 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i32 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i64 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i64 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f32_nxv8i1 = fptoui <vscale x 8 x float> undef to <vscale x 8 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8f64_nxv8i1 = fptoui <vscale x 8 x double> undef to <vscale x 8 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptoui.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptoui.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptoui.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptoui.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptoui.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i8 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i8 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i16 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i16 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i32 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i32 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i64 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i64 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f32_nxv16i1 = fptoui <vscale x 16 x float> undef to <vscale x 16 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16f64_nxv16i1 = fptoui <vscale x 16 x double> undef to <vscale x 16 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptoui.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptoui.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptoui.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptoui.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptoui.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i8 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i8 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i16 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i16 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i32 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i32 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i64 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i64 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f32_nxv32i1 = fptoui <vscale x 32 x float> undef to <vscale x 32 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32f64_nxv32i1 = fptoui <vscale x 32 x double> undef to <vscale x 32 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptoui.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptoui.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptoui.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptoui.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptoui.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i8 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i8 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i8>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i16 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i16 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i16>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i32 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i32 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i32>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i64 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i64 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i64>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f32_nxv64i1 = fptoui <vscale x 64 x float> undef to <vscale x 64 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64f64_nxv64i1 = fptoui <vscale x 64 x double> undef to <vscale x 64 x i1>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptoui.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptoui.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptoui.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptoui.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptoui.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2f32_v2i8 = fptoui <2 x float> undef to <2 x i8>
   %v2f64_v2i8 = fptoui <2 x double> undef to <2 x i8>
@@ -3617,288 +4358,571 @@ define void @fptoui() {
 }
 
 define void @sitofp() {
-; CHECK-LABEL: 'sitofp'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'sitofp'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'sitofp'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_v2f32 = sitofp <2 x i16> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_v2f64 = sitofp <2 x i16> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_v2f32 = sitofp <2 x i32> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_v2f64 = sitofp <2 x i32> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2f32 = sitofp <2 x i64> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2f64 = sitofp <2 x i64> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16_v4f32 = sitofp <4 x i16> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16_v4f64 = sitofp <4 x i16> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32_v4f32 = sitofp <4 x i32> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32_v4f64 = sitofp <4 x i32> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i64_v4f32 = sitofp <4 x i64> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i64_v4f64 = sitofp <4 x i64> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16_v8f32 = sitofp <8 x i16> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16_v8f64 = sitofp <8 x i16> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32_v8f32 = sitofp <8 x i32> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32_v8f64 = sitofp <8 x i32> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i64_v8f32 = sitofp <8 x i64> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i64_v8f64 = sitofp <8 x i64> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i16_v16f32 = sitofp <16 x i16> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i16_v16f64 = sitofp <16 x i16> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32_v16f32 = sitofp <16 x i32> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32_v16f64 = sitofp <16 x i32> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i64_v16f32 = sitofp <16 x i64> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i64_v16f64 = sitofp <16 x i64> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i16_v32f32 = sitofp <32 x i16> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i16_v32f64 = sitofp <32 x i16> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i32_v32f32 = sitofp <32 x i32> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i32_v32f64 = sitofp <32 x i32> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32i64_v32f32 = sitofp <32 x i64> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32i64_v32f64 = sitofp <32 x i64> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i16_v64f32 = sitofp <64 x i16> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i16_v64f64 = sitofp <64 x i16> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v64i32_v64f32 = sitofp <64 x i32> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v64i32_v64f64 = sitofp <64 x i32> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64i64_v64f32 = sitofp <64 x i64> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64i64_v64f64 = sitofp <64 x i64> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v128i16_v128f32 = sitofp <128 x i16> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v128i16_v128f64 = sitofp <128 x i16> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %v128i32_v128f32 = sitofp <128 x i32> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %v128i32_v128f64 = sitofp <128 x i32> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128i64_v128f32 = sitofp <128 x i64> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128i64_v128f64 = sitofp <128 x i64> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i8_nxv1f32 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i8_nxv1f64 = sitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i16_nxv1f32 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i16_nxv1f64 = sitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i32_nxv1f32 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i32_nxv1f64 = sitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64_nxv1f32 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64_nxv1f64 = sitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i1_nxv1f32 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i1_nxv1f64 = sitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i8_nxv2f32 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i8_nxv2f64 = sitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i16_nxv2f32 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i16_nxv2f64 = sitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32_nxv2f32 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32_nxv2f64 = sitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i64_nxv2f32 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i64_nxv2f64 = sitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i1_nxv2f32 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i1_nxv2f64 = sitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.sitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.sitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i8_nxv4f32 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i8_nxv4f64 = sitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16_nxv4f32 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16_nxv4f64 = sitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32_nxv4f32 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32_nxv4f64 = sitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i64_nxv4f32 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i64_nxv4f64 = sitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i1_nxv4f32 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i1_nxv4f64 = sitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.sitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.sitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8_nxv8f32 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8_nxv8f64 = sitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16_nxv8f32 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16_nxv8f64 = sitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32_nxv8f32 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32_nxv8f64 = sitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i64_nxv8f32 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i64_nxv8f64 = sitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i1_nxv8f32 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i1_nxv8f64 = sitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 1 x float> @llvm.vp.sitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 1 x double> @llvm.vp.sitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8_nxv16f32 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8_nxv16f64 = sitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16_nxv16f32 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16_nxv16f64 = sitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i32_nxv16f32 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i32_nxv16f64 = sitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i64_nxv16f32 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i64_nxv16f64 = sitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i1_nxv16f32 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i1_nxv16f64 = sitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.sitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.sitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8_nxv32f32 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8_nxv32f64 = sitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i16_nxv32f32 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i16_nxv32f64 = sitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i32_nxv32f32 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i32_nxv32f64 = sitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i64_nxv32f32 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i64_nxv32f64 = sitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i1_nxv32f32 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i1_nxv32f64 = sitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.sitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.sitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64f32 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64f64 = sitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i16_nxv64f32 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i16_nxv64f64 = sitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i32_nxv64f32 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i32_nxv64f64 = sitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64f32 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64f64 = sitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64f32 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64f64 = sitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.sitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.sitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = sitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = sitofp <2 x i8> undef to <2 x double>
@@ -4212,288 +5236,571 @@ define void @sitofp() {
 }
 
 define void @uitofp() {
-; CHECK-LABEL: 'uitofp'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'uitofp'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 103 for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 102 for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 67 for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 135 for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'uitofp'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_v2f32 = uitofp <2 x i16> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i16_v2f64 = uitofp <2 x i16> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_v2f32 = uitofp <2 x i32> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i32_v2f64 = uitofp <2 x i32> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2f32 = uitofp <2 x i64> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i64_v2f64 = uitofp <2 x i64> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16_v4f32 = uitofp <4 x i16> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i16_v4f64 = uitofp <4 x i16> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32_v4f32 = uitofp <4 x i32> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i32_v4f64 = uitofp <4 x i32> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i64_v4f32 = uitofp <4 x i64> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i64_v4f64 = uitofp <4 x i64> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16_v8f32 = uitofp <8 x i16> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i16_v8f64 = uitofp <8 x i16> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32_v8f32 = uitofp <8 x i32> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i32_v8f64 = uitofp <8 x i32> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i64_v8f32 = uitofp <8 x i64> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i64_v8f64 = uitofp <8 x i64> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i16_v16f32 = uitofp <16 x i16> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i16_v16f64 = uitofp <16 x i16> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32_v16f32 = uitofp <16 x i32> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i32_v16f64 = uitofp <16 x i32> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i64_v16f32 = uitofp <16 x i64> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i64_v16f64 = uitofp <16 x i64> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i16_v32f32 = uitofp <32 x i16> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i16_v32f64 = uitofp <32 x i16> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i32_v32f32 = uitofp <32 x i32> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i32_v32f64 = uitofp <32 x i32> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32i64_v32f32 = uitofp <32 x i64> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %v32i64_v32f64 = uitofp <32 x i64> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 62 for instruction: %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i16_v64f32 = uitofp <64 x i16> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i16_v64f64 = uitofp <64 x i16> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v64i32_v64f32 = uitofp <64 x i32> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %v64i32_v64f64 = uitofp <64 x i32> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64i64_v64f32 = uitofp <64 x i64> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %v64i64_v64f64 = uitofp <64 x i64> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 126 for instruction: %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 124 for instruction: %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v128i16_v128f32 = uitofp <128 x i16> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %v128i16_v128f64 = uitofp <128 x i16> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %v128i32_v128f32 = uitofp <128 x i32> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %v128i32_v128f64 = uitofp <128 x i32> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128i64_v128f32 = uitofp <128 x i64> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %v128i64_v128f64 = uitofp <128 x i64> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 254 for instruction: %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 252 for instruction: %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 248 for instruction: %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i8_nxv1f32 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i8_nxv1f64 = uitofp <vscale x 1 x i8> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i16_nxv1f32 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i16_nxv1f64 = uitofp <vscale x 1 x i16> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i32_nxv1f32 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i32_nxv1f64 = uitofp <vscale x 1 x i32> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64_nxv1f32 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i64_nxv1f64 = uitofp <vscale x 1 x i64> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i1_nxv1f32 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv1i1_nxv1f64 = uitofp <vscale x 1 x i1> undef to <vscale x 1 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi8_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi8_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi32_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi64_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi1_nxv1f32 = call <vscale x 1 x float> @llvm.vp.uitofp.nxv1f32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv1fi1_nxv1f64 = call <vscale x 1 x double> @llvm.vp.uitofp.nxv1f64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i8_nxv2f32 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i8_nxv2f64 = uitofp <vscale x 2 x i8> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i16_nxv2f32 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i16_nxv2f64 = uitofp <vscale x 2 x i16> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32_nxv2f32 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i32_nxv2f64 = uitofp <vscale x 2 x i32> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i64_nxv2f32 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i64_nxv2f64 = uitofp <vscale x 2 x i64> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i1_nxv2f32 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv2i1_nxv2f64 = uitofp <vscale x 2 x i1> undef to <vscale x 2 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi8_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi8_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi32_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi64_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi1_nxv2f32 = call <vscale x 2 x float> @llvm.vp.uitofp.nxv2f32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv2fi1_nxv2f64 = call <vscale x 2 x double> @llvm.vp.uitofp.nxv2f64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i8_nxv4f32 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i8_nxv4f64 = uitofp <vscale x 4 x i8> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16_nxv4f32 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i16_nxv4f64 = uitofp <vscale x 4 x i16> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32_nxv4f32 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i32_nxv4f64 = uitofp <vscale x 4 x i32> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i64_nxv4f32 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i64_nxv4f64 = uitofp <vscale x 4 x i64> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i1_nxv4f32 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv4i1_nxv4f64 = uitofp <vscale x 4 x i1> undef to <vscale x 4 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi8_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi8_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi32_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi64_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi1_nxv4f32 = call <vscale x 4 x float> @llvm.vp.uitofp.nxv4f32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv4fi1_nxv4f64 = call <vscale x 4 x double> @llvm.vp.uitofp.nxv4f64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8_nxv8f32 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i8_nxv8f64 = uitofp <vscale x 8 x i8> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16_nxv8f32 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i16_nxv8f64 = uitofp <vscale x 8 x i16> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32_nxv8f32 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i32_nxv8f64 = uitofp <vscale x 8 x i32> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i64_nxv8f32 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i64_nxv8f64 = uitofp <vscale x 8 x i64> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i1_nxv8f32 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv8i1_nxv8f64 = uitofp <vscale x 8 x i1> undef to <vscale x 8 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi8_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi8_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi32_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi64_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi1_nxv8f32 = call <vscale x 8 x float> @llvm.vp.uitofp.nxv8f32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv8fi1_nxv8f64 = call <vscale x 8 x double> @llvm.vp.uitofp.nxv8f64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8_nxv16f32 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i8_nxv16f64 = uitofp <vscale x 16 x i8> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16_nxv16f32 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i16_nxv16f64 = uitofp <vscale x 16 x i16> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i32_nxv16f32 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i32_nxv16f64 = uitofp <vscale x 16 x i32> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i64_nxv16f32 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i64_nxv16f64 = uitofp <vscale x 16 x i64> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i1_nxv16f32 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv16i1_nxv16f64 = uitofp <vscale x 16 x i1> undef to <vscale x 16 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi8_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi8_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi32_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi64_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi1_nxv16f32 = call <vscale x 16 x float> @llvm.vp.uitofp.nxv16f32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv16fi1_nxv16f64 = call <vscale x 16 x double> @llvm.vp.uitofp.nxv16f64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8_nxv32f32 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i8_nxv32f64 = uitofp <vscale x 32 x i8> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i16_nxv32f32 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i16_nxv32f64 = uitofp <vscale x 32 x i16> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i32_nxv32f32 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i32_nxv32f64 = uitofp <vscale x 32 x i32> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i64_nxv32f32 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i64_nxv32f64 = uitofp <vscale x 32 x i64> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i1_nxv32f32 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv32i1_nxv32f64 = uitofp <vscale x 32 x i1> undef to <vscale x 32 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi8_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi8_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi32_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi64_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi1_nxv32f32 = call <vscale x 32 x float> @llvm.vp.uitofp.nxv32f32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv32fi1_nxv32f64 = call <vscale x 32 x double> @llvm.vp.uitofp.nxv32f64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64f32 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i8_nxv64f64 = uitofp <vscale x 64 x i8> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i16_nxv64f32 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i16_nxv64f64 = uitofp <vscale x 64 x i16> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i32_nxv64f32 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i32_nxv64f64 = uitofp <vscale x 64 x i32> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64f32 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i64_nxv64f64 = uitofp <vscale x 64 x i64> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64f32 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x float>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %nxv64i1_nxv64f64 = uitofp <vscale x 64 x i1> undef to <vscale x 64 x double>
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi8_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi8_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi32_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi64_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi1_nxv64f32 = call <vscale x 64 x float> @llvm.vp.uitofp.nxv64f32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Invalid cost for instruction: %vp_nxv64fi1_nxv64f64 = call <vscale x 64 x double> @llvm.vp.uitofp.nxv64f64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %v2i8_v2f32 = uitofp <2 x i8> undef to <2 x float>
   %v2i8_v2f64 = uitofp <2 x i8> undef to <2 x double>
@@ -4807,32 +6114,59 @@ define void @uitofp() {
 }
 
 define void @oddvec_sizes() {
-; CHECK-LABEL: 'oddvec_sizes'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = sext <3 x i8> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = sext <7 x i8> undef to <7 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = sext <15 x i8> undef to <15 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = zext <3 x i8> undef to <3 x i16>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = zext <7 x i8> undef to <7 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %6 = zext <15 x i8> undef to <15 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = trunc <3 x i32> undef to <3 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = trunc <7 x i32> undef to <7 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = trunc <15 x i32> undef to <15 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = bitcast <3 x i32> undef to <3 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = bitcast <7 x i32> undef to <7 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = bitcast <15 x i32> undef to <15 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = sitofp <3 x i32> undef to <3 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = sitofp <7 x i32> undef to <7 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %15 = sitofp <15 x i32> undef to <15 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = uitofp <3 x i32> undef to <3 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = uitofp <7 x i32> undef to <7 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %18 = uitofp <15 x i32> undef to <15 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = fptosi <3 x float> undef to <3 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = fptosi <7 x float> undef to <7 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %21 = fptosi <15 x float> undef to <15 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = fptoui <3 x float> undef to <3 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = fptoui <7 x float> undef to <7 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %24 = fptoui <15 x float> undef to <15 x i32>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'oddvec_sizes'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = sext <3 x i8> undef to <3 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = sext <7 x i8> undef to <7 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = sext <15 x i8> undef to <15 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = zext <3 x i8> undef to <3 x i16>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = zext <7 x i8> undef to <7 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %6 = zext <15 x i8> undef to <15 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = trunc <3 x i32> undef to <3 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = trunc <7 x i32> undef to <7 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = trunc <15 x i32> undef to <15 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %10 = bitcast <3 x i32> undef to <3 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %11 = bitcast <7 x i32> undef to <7 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: %12 = bitcast <15 x i32> undef to <15 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %13 = sitofp <3 x i32> undef to <3 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %14 = sitofp <7 x i32> undef to <7 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %15 = sitofp <15 x i32> undef to <15 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %16 = uitofp <3 x i32> undef to <3 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %17 = uitofp <7 x i32> undef to <7 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %18 = uitofp <15 x i32> undef to <15 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %19 = fptosi <3 x float> undef to <3 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %20 = fptosi <7 x float> undef to <7 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %21 = fptosi <15 x float> undef to <15 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %22 = fptoui <3 x float> undef to <3 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %23 = fptoui <7 x float> undef to <7 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %24 = fptoui <15 x float> undef to <15 x i32>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'oddvec_sizes'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = sext <3 x i8> undef to <3 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = sext <7 x i8> undef to <7 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %3 = sext <15 x i8> undef to <15 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %4 = zext <3 x i8> undef to <3 x i16>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %5 = zext <7 x i8> undef to <7 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %6 = zext <15 x i8> undef to <15 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %7 = trunc <3 x i32> undef to <3 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %8 = trunc <7 x i32> undef to <7 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = trunc <15 x i32> undef to <15 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %10 = bitcast <3 x i32> undef to <3 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %11 = bitcast <7 x i32> undef to <7 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %12 = bitcast <15 x i32> undef to <15 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %13 = sitofp <3 x i32> undef to <3 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %14 = sitofp <7 x i32> undef to <7 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %15 = sitofp <15 x i32> undef to <15 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %16 = uitofp <3 x i32> undef to <3 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %17 = uitofp <7 x i32> undef to <7 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %18 = uitofp <15 x i32> undef to <15 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %19 = fptosi <3 x float> undef to <3 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %20 = fptosi <7 x float> undef to <7 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %21 = fptosi <15 x float> undef to <15 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %22 = fptoui <3 x float> undef to <3 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %23 = fptoui <7 x float> undef to <7 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %24 = fptoui <15 x float> undef to <15 x i32>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   sext <3 x i8> undef to <3 x i16>
   sext <7 x i8> undef to <7 x i32>
@@ -4873,16 +6207,27 @@ define void @oddvec_sizes() {
 ; vector splitting.  We previously crashed on this case due to an
 ; infinite recursion between cast costing and scalarization costing.
 define void @legalization_crash() {
-; CHECK-LABEL: 'legalization_crash'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16700 for instruction: %1 = bitcast <24 x i8> undef to <192 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %2 = trunc <192 x i8> undef to <192 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %3 = zext <192 x i1> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %4 = sext <192 x i1> undef to <192 x i8>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 948 for instruction: %5 = sitofp <192 x i1> undef to <192 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 948 for instruction: %6 = uitofp <192 x i1> undef to <192 x float>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %7 = fptosi <192 x float> undef to <192 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %8 = fptoui <192 x float> undef to <192 x i1>
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; ZVE64D-LABEL: 'legalization_crash'
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16700 for instruction: %1 = bitcast <24 x i8> undef to <192 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %2 = trunc <192 x i8> undef to <192 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %3 = zext <192 x i1> undef to <192 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %4 = sext <192 x i1> undef to <192 x i8>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 948 for instruction: %5 = sitofp <192 x i1> undef to <192 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 948 for instruction: %6 = uitofp <192 x i1> undef to <192 x float>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %7 = fptosi <192 x float> undef to <192 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %8 = fptoui <192 x float> undef to <192 x i1>
+; ZVE64D-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVE64X-LABEL: 'legalization_crash'
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16700 for instruction: %1 = bitcast <24 x i8> undef to <192 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %2 = trunc <192 x i8> undef to <192 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %3 = zext <192 x i1> undef to <192 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 25 for instruction: %4 = sext <192 x i1> undef to <192 x i8>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %5 = sitofp <192 x i1> undef to <192 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 192 for instruction: %6 = uitofp <192 x i1> undef to <192 x float>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %7 = fptosi <192 x float> undef to <192 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 16892 for instruction: %8 = fptoui <192 x float> undef to <192 x i1>
+; ZVE64X-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   bitcast <24 x i8> undef to <192 x i1>
   trunc <192 x i8> undef to <192 x i1>
@@ -4894,6 +6239,3 @@ define void @legalization_crash() {
   fptoui <192 x float> undef to <192 x i1>
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV32: {{.*}}
-; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/bti-ehpad.ll b/llvm/test/CodeGen/AArch64/bti-ehpad.ll
new file mode 100644
index 000000000000..dee4bd3ec8ad
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/bti-ehpad.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu %s -o - | FileCheck %s
+
+; Purpose: With BTI enabled, the landing pad (%lpad) begins with an EH_LABEL and the
+; first *executed* instruction is `bti j`. (BTI is inserted *after* the EH label and meta.)
+
+declare i32 @__gxx_personality_v0(...)
+declare void @may_throw()
+
+define void @test() #0 personality ptr @__gxx_personality_v0 {
+; CHECK-LABEL: test:
+; CHECK:       .Lfunc_begin0:
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:    .cfi_personality 156, DW.ref.__gxx_personality_v0
+; CHECK-NEXT:    .cfi_lsda 28, .Lexception0
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    bti c
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    bl may_throw
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:  // %bb.1: // %common.ret
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_2: // %lpad
+; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:    bti j
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+entry:
+  invoke void @may_throw()
+          to label %ret unwind label %lpad
+
+lpad:
+  landingpad { ptr, i32 } cleanup
+  ret void
+
+ret:
+  ret void
+}
+
+attributes #0 = { noinline "branch-target-enforcement"="true" "target-features"="+bti" }
diff --git a/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
new file mode 100644
index 000000000000..381904f77660
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/late-taildup-computed-goto.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -tail-dup-pred-size=2 -tail-dup-succ-size=2 -o - %s | FileCheck %s
+
+target triple = "arm64-apple-macosx13.0.0"
+
+@opcode.targets = local_unnamed_addr constant [6 x ptr] [ptr blockaddress(@test_interp, %op1.bb), ptr blockaddress(@test_interp, %op6.bb), ptr blockaddress(@test_interp, %loop.header), ptr blockaddress(@test_interp, %op2.bb), ptr blockaddress(@test_interp, %op4.bb), ptr blockaddress(@test_interp, %op5.bb)]
+
+define void @test_interp(ptr %frame, ptr %dst) {
+; CHECK-LABEL: test_interp:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    stp x24, x23, [sp, #-64]! ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x22, x21, [sp, #16] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #32] ; 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #48] ; 16-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset w30, -8
+; CHECK-NEXT:    .cfi_offset w29, -16
+; CHECK-NEXT:    .cfi_offset w19, -24
+; CHECK-NEXT:    .cfi_offset w20, -32
+; CHECK-NEXT:    .cfi_offset w21, -40
+; CHECK-NEXT:    .cfi_offset w22, -48
+; CHECK-NEXT:    .cfi_offset w23, -56
+; CHECK-NEXT:    .cfi_offset w24, -64
+; CHECK-NEXT:  Lloh0:
+; CHECK-NEXT:    adrp x21, _opcode.targets@PAGE
+; CHECK-NEXT:  Lloh1:
+; CHECK-NEXT:    add x21, x21, _opcode.targets@PAGEOFF
+; CHECK-NEXT:    mov x24, xzr
+; CHECK-NEXT:    add x8, x21, xzr, lsl #3
+; CHECK-NEXT:    mov x19, x1
+; CHECK-NEXT:    mov x20, x0
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    mov w22, #1 ; =0x1
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp0: ; Block address taken
+; CHECK-NEXT:  LBB0_1: ; %loop.header
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    mov x20, xzr
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp1: ; Block address taken
+; CHECK-NEXT:  LBB0_2: ; %op1.bb
+; CHECK-NEXT:    str xzr, [x19]
+; CHECK-NEXT:  Ltmp2: ; Block address taken
+; CHECK-NEXT:  LBB0_3: ; %op6.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldr x0, [x20, #-8]!
+; CHECK-NEXT:    ldr x8, [x0, #8]
+; CHECK-NEXT:    str x22, [x0]
+; CHECK-NEXT:    ldr x8, [x8, #48]
+; CHECK-NEXT:    blr x8
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp3: ; Block address taken
+; CHECK-NEXT:  LBB0_4: ; %op2.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    mov x20, xzr
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    mov x23, xzr
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:  Ltmp4: ; Block address taken
+; CHECK-NEXT:  LBB0_5: ; %op4.bb
+; CHECK-NEXT:  Ltmp5: ; Block address taken
+; CHECK-NEXT:  LBB0_6: ; %op5.bb
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    str x23, [x19]
+; CHECK-NEXT:    ldur x8, [x23, #12]
+; CHECK-NEXT:    ldur x9, [x20, #-8]
+; CHECK-NEXT:    add x23, x23, #20
+; CHECK-NEXT:    stp x8, x9, [x20, #-8]
+; CHECK-NEXT:    add x8, x21, x24, lsl #3
+; CHECK-NEXT:    add x20, x20, #8
+; CHECK-NEXT:    add x24, x24, #1
+; CHECK-NEXT:    br x8
+; CHECK-NEXT:    .loh AdrpAdd Lloh0, Lloh1
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %op1.bb ], [ %iv.next, %op2.bb ], [ %iv.next, %op4.bb ], [ %iv.next, %op5.bb ], [ %iv.next, %op6.bb ], [ %iv.next, %loop.header ]
+  %stack.pointer = phi ptr [ %frame, %entry ], [ %stack.8, %op1.bb ], [ null, %op2.bb ], [ %stack.next, %op4.bb ], [ %stack.next.2, %op5.bb ], [ %stack.4, %op6.bb ], [ null, %loop.header ]
+  %next.instr = phi ptr [ null, %entry ], [ %next.instr, %op1.bb ], [ null, %op2.bb ], [ %next.instr.20, %op4.bb ], [ %next.instr.21, %op5.bb ], [ %next.instr, %op6.bb ], [ null, %loop.header ]
+  %iv.next = add i64 %iv, 1
+  %next_op = getelementptr [6 x ptr], ptr @opcode.targets, i64 0, i64 %iv
+  indirectbr ptr %next_op, [label %op1.bb, label %op6.bb, label %loop.header, label %op2.bb, label %op4.bb, label %op5.bb]
+
+op1.bb:
+  store ptr null, ptr %dst, align 8
+  %stack.8 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.0 = load ptr, ptr %stack.8, align 8
+  store i64 1, ptr %l.0, align 8
+  %gep.0 = getelementptr i8, ptr %l.0, i64 8
+  %l.1 = load ptr, ptr %gep.0, align 8
+  %gep.1 = getelementptr i8, ptr %l.1, i64 48
+  %l.2 = load ptr, ptr %gep.1, align 8
+  tail call void %l.2(ptr nonnull %l.0)
+  br label %loop.header
+
+op2.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  br label %loop.header
+
+op4.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  %next.instr.20 = getelementptr i8, ptr %next.instr, i64 20
+  %stack.2 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.3 = load ptr, ptr %stack.2, align 8
+  %next.instr.12 = getelementptr i8, ptr %next.instr, i64 12
+  %next.instr.12.val = load ptr, ptr %next.instr.12, align 2
+  store ptr %next.instr.12.val, ptr %stack.2, align 8
+  store ptr %l.3, ptr %stack.pointer, align 8
+  %stack.next = getelementptr i8, ptr %stack.pointer, i64 8
+  br label %loop.header
+
+op5.bb:
+  store ptr %next.instr, ptr %dst, align 8
+  %next.instr.21 = getelementptr i8, ptr %next.instr, i64 20
+  %stack.3 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.4 = load ptr, ptr %stack.3, align 8
+  %next.instr.2 = getelementptr i8, ptr %next.instr, i64 12
+  %next.instr.2.val = load ptr, ptr %next.instr.2, align 2
+  store ptr %next.instr.2.val, ptr %stack.3, align 8
+  store ptr %l.4, ptr %stack.pointer, align 8
+  %stack.next.2 = getelementptr i8, ptr %stack.pointer, i64 8
+  br label %loop.header
+
+op6.bb:
+  %stack.4 = getelementptr i8, ptr %stack.pointer, i64 -8
+  %l.5 = load ptr, ptr %stack.4, align 8
+  store i64 1, ptr %l.5, align 8
+  %gep.5 = getelementptr i8, ptr %l.5, i64 8
+  %l.6 = load ptr, ptr %gep.5, align 8
+  %gep.6 = getelementptr i8, ptr %l.6, i64 48
+  %l.7 = load ptr, ptr %gep.6, align 8
+  tail call void %l.7(ptr nonnull %l.5)
+  br label %loop.header
+}
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-call.ll b/llvm/test/CodeGen/AArch64/ptrauth-call.ll
index 700686b9f194..fc555a882be2 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-call.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-call.ll
@@ -181,8 +181,9 @@ define void @test_tailcall_omit_mov_x16_x16(ptr %objptr) #0 {
 ; ELF-NEXT:       movk    x8, #6503, lsl #48
 ; ELF-NEXT:       autda   x1, x8
 ; ELF-NEXT:       ldr     x2, [x1]
-; ELF-NEXT:       movk    x1, #54167, lsl #48
-; ELF-NEXT:       braa    x2, x1
+; ELF-NEXT:       mov     x16, x1
+; ELF-NEXT:       movk    x16, #54167, lsl #48
+; ELF-NEXT:       braa    x2, x16
   %vtable.signed = load ptr, ptr %objptr, align 8
   %objptr.int = ptrtoint ptr %objptr to i64
   %vtable.discr = tail call i64 @llvm.ptrauth.blend(i64 %objptr.int, i64 6503)
@@ -213,8 +214,9 @@ define i32 @test_call_omit_extra_moves(ptr %objptr) #0 {
 ; ELF-NEXT:      movk    x9, #6503, lsl #48
 ; ELF-NEXT:      autda   x8, x9
 ; ELF-NEXT:      ldr     x9, [x8]
-; ELF-NEXT:      movk    x8, #34646, lsl #48
-; ELF-NEXT:      blraa   x9, x8
+; ELF-NEXT:      mov     x17, x8
+; ELF-NEXT:      movk    x17, #34646, lsl #48
+; ELF-NEXT:      blraa   x9, x17
 ; ELF-NEXT:      mov     w0, #42
 ; ELF-NEXT:      ldr     x30, [sp], #16
 ; CHECK-NEXT:    ret
@@ -230,6 +232,97 @@ define i32 @test_call_omit_extra_moves(ptr %objptr) #0 {
   ret i32 42
 }
 
+; The second BLRA instruction should not reuse its AddrDisc operand as a scratch register (returned later).
+define i64 @test_call_discr_csr_live(ptr %fnptr, i64 %addr.discr) #0 {
+; ELF-LABEL: test_call_discr_csr_live:
+; ELF-NEXT:    str     x30, [sp, #-32]!
+; ELF-NEXT:    stp     x20, x19, [sp, #16]
+; ELF-DAG:     mov     x[[FNPTR:[0-9]+]], x0
+; ELF-DAG:     mov     x[[ADDR_DISC:[0-9]+]], x1
+; ELF-DAG:     mov     x17, x1
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x0, x17
+; ELF-NEXT:    mov     x17, x[[ADDR_DISC]]
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x[[FNPTR]], x17
+; ELF-NEXT:    mov     x0, x[[ADDR_DISC]]
+; ELF-NEXT:    ldp     x20, x19, [sp, #16]
+; ELF-NEXT:    ldr     x30, [sp], #32
+; ELF-NEXT:    ret
+  %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503)
+  tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ]
+  tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ]
+  ret i64 %addr.discr
+}
+
+; The second BLRA instruction may reuse its AddrDisc operand as a scratch register.
+define i64 @test_call_discr_csr_killed(ptr %fnptr, i64 %addr.discr) #0 {
+; ELF-LABEL: test_call_discr_csr_killed:
+; ELF-NEXT:    str     x30, [sp, #-32]!
+; ELF-NEXT:    stp     x20, x19, [sp, #16]
+; ELF-DAG:     mov     x[[FNPTR:[0-9]+]], x0
+; ELF-DAG:     mov     x[[ADDR_DISC:[0-9]+]], x1
+; ELF-DAG:     mov     x17, x1
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x0, x17
+; ELF-DAG:     mov     x17, x[[ADDR_DISC]]
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x[[FNPTR]], x17
+; ELF-NEXT:    ldp     x20, x19, [sp, #16]
+; ELF-NEXT:    mov     w0, #42
+; ELF-NEXT:    ldr     x30, [sp], #32
+; ELF-NEXT:    ret
+  %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503)
+  tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ]
+  tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ]
+  ret i64 42
+}
+
+; BLRA instruction should not reuse its AddrDisc operand as a scratch register (function argument).
+define i64 @test_call_discr_arg(ptr %fnptr, i64 %addr.discr) #0 {
+; ELF-LABEL: test_call_discr_arg:
+; ELF-NEXT:    str     x30, [sp, #-16]!
+; ELF-NEXT:    mov     x8, x0
+; ELF-NEXT:    mov     x0, xzr
+; ELF-NEXT:    mov     x17, x1
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x8, x17
+; ELF-NEXT:    mov     w0, #42
+; ELF-NEXT:    ldr     x30, [sp], #16
+; ELF-NEXT:    ret
+  %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503)
+  tail call void %fnptr(ptr null, i64 %addr.discr) [ "ptrauth"(i32 0, i64 %discr) ]
+  ret i64 42
+}
+
+; BLRA instruction may reuse its AddrDisc operand as a scratch register.
+define i64 @test_call_discr_non_arg(ptr %fnptr, i64 %addr.discr) #0 {
+; ELF-LABEL: test_call_discr_non_arg:
+; ELF-NEXT:    str     x30, [sp, #-16]!
+; ELF-NEXT:    mov     x17, x1
+; ELF-NEXT:    movk    x17, #6503, lsl #48
+; ELF-NEXT:    blraa   x0, x17
+; ELF-NEXT:    mov     w0, #42
+; ELF-NEXT:    ldr     x30, [sp], #16
+; ELF-NEXT:    ret
+  %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503)
+  tail call void %fnptr() [ "ptrauth"(i32 0, i64 %discr) ]
+  ret i64 42
+}
+
+; AUTH_TCRETURN instruction should not reuse its AddrDisc operand as a scratch register (function argument).
+define i64 @test_tailcall_discr_arg(ptr %fnptr, i64 %addr.discr) #0 {
+; ELF-LABEL: test_tailcall_discr_arg:
+; ELF-NEXT:    mov     x2, x0
+; ELF-NEXT:    mov     x0, xzr
+; ELF-NEXT:    mov     x16, x1
+; ELF-NEXT:    movk    x16, #6503, lsl #48
+; ELF-NEXT:    braa    x2, x16
+  %discr = tail call i64 @llvm.ptrauth.blend(i64 %addr.discr, i64 6503)
+  %result = tail call i64 %fnptr(ptr null, i64 %addr.discr) [ "ptrauth"(i32 0, i64 %discr) ]
+  ret i64 %result
+}
+
 define i32 @test_call_ia_arg(ptr %arg0, i64 %arg1) #0 {
 ; DARWIN-LABEL: test_call_ia_arg:
 ; DARWIN-NEXT:    stp x29, x30, [sp, #-16]!
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
index 2634beb4e359..52b38a563200 100644
--- a/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
+++ b/llvm/test/CodeGen/AArch64/ptrauth-intrinsic-auth-resign.ll
@@ -755,5 +755,128 @@ define i64 @test_auth_ia_swapped(i64 %arg, i64 %arg1) {
   ret i64 %tmp
 }
 
+; Authentications should not be speculated, as they crash on failure and it is
+; perfectly correct to dynamically choose the signing schema or whether to
+; perform authentication at all.
+define ptr @auth_speculation(i64 %signed, i1 %cond) {
+; UNCHECKED-LABEL:        auth_speculation:
+; UNCHECKED:                  %bb.0:
+; UNCHECKED-DARWIN-NEXT:        mov     x16, x0
+; UNCHECKED-DARWIN-NEXT:        tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; UNCHECKED-DARWIN-NEXT:      %bb.1:
+; UNCHECKED-DARWIN-NEXT:        autdza  x16
+; UNCHECKED-DARWIN-NEXT:        b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; UNCHECKED-DARWIN-NEXT:      [[BB_ELSE]]:
+; UNCHECKED-DARWIN-NEXT:        autdzb  x16
+; UNCHECKED-DARWIN-NEXT:      [[BB_RETURN]]:
+; UNCHECKED-DARWIN-NEXT:        ldr     x8, [x16]
+; UNCHECKED-ELF-NEXT:           tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; UNCHECKED-ELF-NEXT:         %bb.1:
+; UNCHECKED-ELF-NEXT:           autdza  x0
+; UNCHECKED-ELF-NEXT:           b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; UNCHECKED-ELF-NEXT:         [[BB_ELSE]]:
+; UNCHECKED-ELF-NEXT:           autdzb  x0
+; UNCHECKED-ELF-NEXT:         [[BB_RETURN]]:
+; UNCHECKED-ELF-NEXT:           ldr     x8, [x0]
+; UNCHECKED-NEXT:               ldr     x8, [x8]
+; UNCHECKED-NEXT:               ldr     x8, [x8]
+; UNCHECKED-NEXT:               ldr     x0, [x8]
+; UNCHECKED-NEXT:               ret
+;
+; CHECKED-LABEL:        auth_speculation:
+; CHECKED:                  %bb.0:
+; CHECKED-DARWIN-NEXT:        mov     x16, x0
+; CHECKED-DARWIN-NEXT:        tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; CHECKED-DARWIN-NEXT:      %bb.1:
+; CHECKED-DARWIN-NEXT:        autdza  x16
+; CHECKED-DARWIN-NEXT:        b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; CHECKED-DARWIN-NEXT:      [[BB_ELSE]]:
+; CHECKED-DARWIN-NEXT:        autdzb  x16
+; CHECKED-DARWIN-NEXT:      [[BB_RETURN]]:
+; CHECKED-DARWIN-NEXT:        ldr     x8, [x16]
+; CHECKED-ELF-NEXT:           tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; CHECKED-ELF-NEXT:         %bb.1:
+; CHECKED-ELF-NEXT:           autdza  x0
+; CHECKED-ELF-NEXT:           b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; CHECKED-ELF-NEXT:         [[BB_ELSE]]:
+; CHECKED-ELF-NEXT:           autdzb  x0
+; CHECKED-ELF-NEXT:         [[BB_RETURN]]:
+; CHECKED-ELF-NEXT:           ldr     x8, [x0]
+; CHECKED-NEXT:               ldr     x8, [x8]
+; CHECKED-NEXT:               ldr     x8, [x8]
+; CHECKED-NEXT:               ldr     x0, [x8]
+; CHECKED-NEXT:               ret
+;
+; TRAP-LABEL:        auth_speculation:
+; TRAP:                  %bb.0:
+; TRAP-DARWIN-NEXT:        mov     x16, x0
+; TRAP-DARWIN-NEXT:        tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; TRAP-DARWIN-NEXT:      %bb.1:
+; TRAP-DARWIN-NEXT:        autdza  x16
+; TRAP-DARWIN-NEXT:        mov     x17, x16
+; TRAP-DARWIN-NEXT:        xpacd   x17
+; TRAP-DARWIN-NEXT:        cmp     x16, x17
+; TRAP-DARWIN-NEXT:        b.eq    [[L]]auth_success_18
+; TRAP-DARWIN-NEXT:        brk     #0xc472
+; TRAP-DARWIN-NEXT:      [[L]]auth_success_18:
+; TRAP-DARWIN-NEXT:        b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; TRAP-DARWIN-NEXT:      [[BB_ELSE]]:
+; TRAP-DARWIN-NEXT:        autdzb  x16
+; TRAP-DARWIN-NEXT:        mov     x17, x16
+; TRAP-DARWIN-NEXT:        xpacd   x17
+; TRAP-DARWIN-NEXT:        cmp     x16, x17
+; TRAP-DARWIN-NEXT:        b.eq    [[L]]auth_success_19
+; TRAP-DARWIN-NEXT:        brk     #0xc473
+; TRAP-DARWIN-NEXT:      [[L]]auth_success_19:
+; TRAP-DARWIN-NEXT:      [[BB_RETURN]]:
+; TRAP-DARWIN-NEXT:        ldr     x8, [x16]
+; TRAP-ELF-NEXT:           tbz     w1, #0, [[BB_ELSE:[A-Za-z0-9_.]+]]
+; TRAP-ELF-NEXT:         %bb.1:
+; TRAP-ELF-NEXT:           autdza  x0
+; TRAP-ELF-NEXT:           mov     x8, x0
+; TRAP-ELF-NEXT:           xpacd   x8
+; TRAP-ELF-NEXT:           cmp     x0, x8
+; TRAP-ELF-NEXT:           b.eq    [[L]]auth_success_18
+; TRAP-ELF-NEXT:           brk     #0xc472
+; TRAP-ELF-NEXT:         [[L]]auth_success_18:
+; TRAP-ELF-NEXT:           b       [[BB_RETURN:[A-Za-z0-9_.]+]]
+; TRAP-ELF-NEXT:         [[BB_ELSE]]:
+; TRAP-ELF-NEXT:           autdzb  x0
+; TRAP-ELF-NEXT:           mov     x8, x0
+; TRAP-ELF-NEXT:           xpacd   x8
+; TRAP-ELF-NEXT:           cmp     x0, x8
+; TRAP-ELF-NEXT:           b.eq    [[L]]auth_success_19
+; TRAP-ELF-NEXT:           brk     #0xc473
+; TRAP-ELF-NEXT:         [[L]]auth_success_19:
+; TRAP-ELF-NEXT:         [[BB_RETURN]]:
+; TRAP-ELF-NEXT:           ldr     x8, [x0]
+; TRAP-NEXT:               ldr     x8, [x8]
+; TRAP-NEXT:               ldr     x8, [x8]
+; TRAP-NEXT:               ldr     x0, [x8]
+; TRAP-NEXT:               ret
+entry:
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:
+  %auted.then = tail call i64 @llvm.ptrauth.auth(i64 %signed, i32 2, i64 0)
+  br label %return
+
+if.else:
+  %auted.else = tail call i64 @llvm.ptrauth.auth(i64 %signed, i32 3, i64 0)
+  br label %return
+
+return:
+  %auted = phi i64 [ %auted.then, %if.then ], [ %auted.else, %if.else ]
+
+  ; A sequence of instructions that is common to both "then" and "else"
+  ; branches and is expensive to duplicate.
+  %ptr.0 = inttoptr i64 %auted to ptr
+  %ptr.1 = load ptr, ptr %ptr.0
+  %ptr.2 = load ptr, ptr %ptr.1
+  %ptr.3 = load ptr, ptr %ptr.2
+  %ptr.4 = load ptr, ptr %ptr.3
+  ret ptr %ptr.4
+}
+
 declare i64 @llvm.ptrauth.auth(i64, i32, i64)
 declare i64 @llvm.ptrauth.resign(i64, i32, i64, i32, i64)
diff --git a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
index 85aa6846cd80..0091469edde9 100644
--- a/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
+++ b/llvm/test/CodeGen/AArch64/sign-return-address-pauth-lr.ll
@@ -507,8 +507,8 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_a_key_bti:
 ; PAUTHLR:       // %bb.0:
-; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
+; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:  .Ltmp10:
 ; PAUTHLR-NEXT:    paciasppc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp10
@@ -521,8 +521,8 @@ define i32 @leaf_sign_all_a_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-return-address"="all" "sign-return-address-key"="b_key" "branch-target-enforcement" {
 ; COMPAT-LABEL: leaf_sign_all_b_key_bti:
 ; COMPAT:       // %bb.0:
+; COMPAT-NEXT:   .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #34
-; COMPAT-NEXT:    .cfi_b_key_frame
 ; COMPAT-NEXT:    hint #39
 ; COMPAT-NEXT:    .cfi_negate_ra_state_with_pc
 ; COMPAT-NEXT:  .Ltmp11:
@@ -535,8 +535,8 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ;
 ; V83A-LABEL: leaf_sign_all_b_key_bti:
 ; V83A:       // %bb.0:
-; V83A-NEXT:    hint #34
 ; V83A-NEXT:    .cfi_b_key_frame
+; V83A-NEXT:    hint #34
 ; V83A-NEXT:    hint #39
 ; V83A-NEXT:    .cfi_negate_ra_state_with_pc
 ; V83A-NEXT:  .Ltmp11:
@@ -548,9 +548,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_b_key_bti:
 ; PAUTHLR:       // %bb.0:
-; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
 ; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
+; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:  .Ltmp11:
 ; PAUTHLR-NEXT:    pacibsppc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp11
@@ -563,9 +563,9 @@ define i32 @leaf_sign_all_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-r
 define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "sign-return-address"="all" "target-features"="+v8.3a" "sign-return-address-key"="b_key" "branch-target-enforcement" {
 ; CHECK-LABEL: leaf_sign_all_v83_b_key_bti:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    hint #34
-; CHECK-NEXT:    .cfi_b_key_frame
-; CHECK-NEXT:    hint #39
+; CHECK-NEXT:   .cfi_b_key_frame
+; CHECK-NEXT:   hint #34
+; CHECK-NEXT:   hint #39
 ; CHECK-NEXT:    .cfi_negate_ra_state_with_pc
 ; CHECK-NEXT:  .Ltmp12:
 ; CHECK-NEXT:    pacibsp
@@ -576,9 +576,9 @@ define i32 @leaf_sign_all_v83_b_key_bti(i32 %x) "branch-protection-pauth-lr" "si
 ;
 ; PAUTHLR-LABEL: leaf_sign_all_v83_b_key_bti:
 ; PAUTHLR:       // %bb.0:
-; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:    .cfi_b_key_frame
 ; PAUTHLR-NEXT:    .cfi_negate_ra_state_with_pc
+; PAUTHLR-NEXT:    bti c
 ; PAUTHLR-NEXT:  .Ltmp12:
 ; PAUTHLR-NEXT:    pacibsppc
 ; PAUTHLR-NEXT:    adrp x16, .Ltmp12
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 83c9b73c5757..2b16dd0f29ec 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -1,5 +1,5 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s
-# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND
 --- |
   ; ModuleID = '<stdin>'
   source_filename = "<stdin>"
@@ -14,13 +14,14 @@
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable }
-  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable }
   define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable }
-  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable }
+  define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable }
 
   attributes #0 = { nounwind "target-features"="+sve" }
   attributes #1 = { nounwind "target-features"="+sve2p1" }
+  attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" }
 
 ...
 ---
@@ -318,10 +319,10 @@ registers:
   - { id: 0, class: zpr2 }
 stack:
 liveins:
-  - { reg: '$z0_z1', virtual-reg: '%0' }
+  - { reg: '$z1_z2', virtual-reg: '%0' }
 body:             |
   bb.0.entry:
-    liveins: $z0_z1
+    liveins: $z1_z2
 
     ; CHECK-LABEL: name: spills_fills_stack_id_zpr2
     ; CHECK: stack:
@@ -329,12 +330,12 @@ body:             |
     ; CHECK-NEXT:     stack-id: scalable-vector
 
     ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
+    ; EXPAND: STR_ZXI $z1, $sp, 0
+    ; EXPAND: STR_ZXI $z2, $sp, 1
+    ; EXPAND: $z1 = LDR_ZXI $sp, 0
+    ; EXPAND: $z2 = LDR_ZXI $sp, 1
 
-    %0:zpr2 = COPY $z0_z1
+    %0:zpr2 = COPY $z1_z2
 
     $z0_z1_z2_z3     = IMPLICIT_DEF
     $z4_z5_z6_z7     = IMPLICIT_DEF
@@ -345,7 +346,7 @@ body:             |
     $z24_z25_z26_z27 = IMPLICIT_DEF
     $z28_z29_z30_z31 = IMPLICIT_DEF
 
-    $z0_z1 = COPY %0
+    $z1_z2 = COPY %0
     RET_ReallyLR
 ...
 ---
@@ -439,10 +440,10 @@ registers:
   - { id: 0, class: zpr4 }
 stack:
 liveins:
-  - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' }
+  - { reg: '$z1_z2_z3_z4', virtual-reg: '%0' }
 body:             |
   bb.0.entry:
-    liveins: $z0_z1_z2_z3
+    liveins: $z1_z2_z3_z4
 
     ; CHECK-LABEL: name: spills_fills_stack_id_zpr4
     ; CHECK: stack:
@@ -450,16 +451,16 @@ body:             |
     ; CHECK-NEXT:     stack-id: scalable-vector
 
     ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4
-    ; EXPAND: STR_ZXI $z0, $sp, 0
-    ; EXPAND: STR_ZXI $z1, $sp, 1
-    ; EXPAND: STR_ZXI $z2, $sp, 2
-    ; EXPAND: STR_ZXI $z3, $sp, 3
-    ; EXPAND: $z0 = LDR_ZXI $sp, 0
-    ; EXPAND: $z1 = LDR_ZXI $sp, 1
-    ; EXPAND: $z2 = LDR_ZXI $sp, 2
-    ; EXPAND: $z3 = LDR_ZXI $sp, 3
+    ; EXPAND: STR_ZXI $z1, $sp, 0
+    ; EXPAND: STR_ZXI $z2, $sp, 1
+    ; EXPAND: STR_ZXI $z3, $sp, 2
+    ; EXPAND: STR_ZXI $z4, $sp, 3
+    ; EXPAND: $z1 = LDR_ZXI $sp, 0
+    ; EXPAND: $z2 = LDR_ZXI $sp, 1
+    ; EXPAND: $z3 = LDR_ZXI $sp, 2
+    ; EXPAND: $z4 = LDR_ZXI $sp, 3
 
-    %0:zpr4 = COPY $z0_z1_z2_z3
+    %0:zpr4 = COPY $z1_z2_z3_z4
 
     $z0_z1_z2_z3     = IMPLICIT_DEF
     $z4_z5_z6_z7     = IMPLICIT_DEF
@@ -470,7 +471,7 @@ body:             |
     $z24_z25_z26_z27 = IMPLICIT_DEF
     $z28_z29_z30_z31 = IMPLICIT_DEF
 
-    $z0_z1_z2_z3 = COPY %0
+    $z1_z2_z3_z4 = COPY %0
     RET_ReallyLR
 ...
 ---
diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
index ae70f91a4ec6..a1d615c91079 100644
--- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
+++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir
@@ -12,7 +12,7 @@ body:             |
   bb.0:
     liveins: $p0, $z0
 
-    ; CHECK: add_x
+    ; CHECK: name: add_x
     ; CHECK-NOT: MOVPRFX
     ; CHECK: $z0 = FADD_ZPmZ_S renamable $p0, killed $z0, renamable $z0
     ; CHECK-NEXT: RET
@@ -21,22 +21,36 @@ body:             |
 
 ...
 
-# CHECK: {{.*}} MSB_ZPmZZ_B {{.*}}
 ---
 name: expand_mls_to_msb
 body:             |
   bb.0:
+    ; CHECK: name: expand_mls_to_msb
+    ; CHECK: {{.*}} MSB_ZPmZZ_B {{.*}}
     renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
 
-# CHECK: {{.*}} MAD_ZPmZZ_B {{.*}}
 ---
 name: expand_mla_to_mad
 body:             |
   bb.0:
+    ; CHECK: name: expand_mla_to_mad
+    ; CHECK: {{.*}} MAD_ZPmZZ_B {{.*}}
     renamable $p0 = PTRUE_B 31, implicit $vg
     renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1
     RET_ReallyLR implicit $z0
 ...
+
+---
+name: expand_transfer_implicit_defs
+body:             |
+  bb.0:
+    ; CHECK: name: expand_transfer_implicit_defs
+    ; CHECK:      BUNDLE
+    ; CHECK-SAME: implicit-def $z0_z1_z2_z3
+    liveins: $z1, $z2, $p0
+    renamable $z0 = FADD_ZPZZ_D_UNDEF killed $p0, killed $z1, killed $z2, implicit-def $z0_z1_z2_z3
+    RET_ReallyLR implicit $z0_z1_z2_z3
+...
diff --git a/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
index e18ac46165d2..7f40efa34271 100644
--- a/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
+++ b/llvm/test/CodeGen/AArch64/veclib-llvm.sincos.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter "(bl|ptrue)" --version 5
 ; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
 ; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
+; Check we expand to a vector library call on aarch64-amazon-linux:
+; RUN: llc -mtriple=aarch64-amazon-linux -mattr=+neon,+sve -vector-library=sleefgnuabi < %s | FileCheck %s -check-prefix=SLEEF
+; RUN: llc -mtriple=aarch64-amazon-linux -mattr=+neon,+sve -vector-library=ArmPL < %s | FileCheck %s -check-prefix=ARMPL
 
 define void @test_sincos_v4f32(<4 x float> %x, ptr noalias %out_sin, ptr noalias %out_cos) {
 ; SLEEF-LABEL: test_sincos_v4f32:
diff --git a/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll b/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll
new file mode 100644
index 000000000000..3a1eaf04af5a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/wineh-bti-funclet.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-windows -mattr=+bti -o - %s | FileCheck %s
+
+declare i32 @__CxxFrameHandler3(...)
+declare void @may_throw()
+
+; Purpose: For WinEH funclets, entry is call-like: accept `bti c` / `hint #34` or a PAC prologue.
+
+define dso_local void @wineh_funclet() #0 personality ptr @__CxxFrameHandler3 {
+; CHECK-LABEL: wineh_funclet:
+; CHECK:       .Lfunc_begin0:
+; CHECK-NEXT:  .seh_proc wineh_funclet
+; CHECK-NEXT:    .seh_handler __CxxFrameHandler3, @unwind, @except
+; CHECK-NEXT:  // %bb.0: // %entry
+; CHECK-NEXT:    bti c
+; CHECK-NEXT:    .seh_nop
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    .seh_save_fplr_x 32
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    .seh_set_fp
+; CHECK-NEXT:    .seh_endprologue
+; CHECK-NEXT:    mov x0, #-2 // =0xfffffffffffffffe
+; CHECK-NEXT:    stur x0, [x29, #16]
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    bl may_throw
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:  .LBB0_1: // Block address taken
+; CHECK-NEXT:    // %try.cont
+; CHECK-NEXT:  $ehgcr_0_1:
+; CHECK-NEXT:    bti j
+; CHECK-NEXT:    .seh_startepilogue
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    .seh_save_fplr_x 32
+; CHECK-NEXT:    .seh_endepilogue
+; CHECK-NEXT:    ret
+; CHECK-NEXT:    .seh_endfunclet
+; CHECK-NEXT:    .seh_handlerdata
+; CHECK-NEXT:    .word $cppxdata$wineh_funclet@IMGREL
+; CHECK-NEXT:    .text
+; CHECK-NEXT:    .seh_endproc
+; CHECK-NEXT:    .def "?catch$2@?0?wineh_funclet@4HA";
+; CHECK-NEXT:    .scl 3;
+; CHECK-NEXT:    .type 32;
+; CHECK-NEXT:    .endef
+; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:  "?catch$2@?0?wineh_funclet@4HA":
+; CHECK-NEXT:  .seh_proc "?catch$2@?0?wineh_funclet@4HA"
+; CHECK-NEXT:    .seh_handler __CxxFrameHandler3, @unwind, @except
+; CHECK-NEXT:  .LBB0_2: // %catch
+; CHECK-NEXT:    bti c
+; CHECK-NEXT:    .seh_nop
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    .seh_save_fplr_x 16
+; CHECK-NEXT:    .seh_endprologue
+; CHECK-NEXT:    bl may_throw
+; CHECK-NEXT:    adrp x0, .LBB0_1
+; CHECK-NEXT:    add x0, x0, .LBB0_1
+; CHECK-NEXT:    .seh_startepilogue
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    .seh_save_fplr_x 16
+; CHECK-NEXT:    .seh_endepilogue
+; CHECK-NEXT:    ret
+entry:
+  invoke void @may_throw()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:
+  %cs = catchswitch within none [label %catch] unwind to caller
+
+catch:
+  %cp = catchpad within %cs [ptr null, i32 0, ptr null]
+  call void @may_throw() ["funclet"(token %cp)]
+  catchret from %cp to label %try.cont
+
+try.cont:
+  ret void
+}
+
+attributes #0 = { noinline "branch-target-enforcement"="true" }
diff --git a/llvm/test/CodeGen/AArch64/wineh-bti.ll b/llvm/test/CodeGen/AArch64/wineh-bti.ll
index 8b8960d37f9e..86555a7f6436 100644
--- a/llvm/test/CodeGen/AArch64/wineh-bti.ll
+++ b/llvm/test/CodeGen/AArch64/wineh-bti.ll
@@ -29,7 +29,7 @@ lbl4:
 
 ; CHECK-LABEL: func:
 ; CHECK-NEXT: .seh_proc func
-; CHECK-NEXT: // %bb.0:
+; CHECK:     // %bb.0: // %entry
 ; CHECK-NEXT: hint #34
 ; CHECK-NEXT: .seh_nop
 ; CHECK-NEXT: str x19, [sp, #-16]!
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index f8e13fcdd227..51398a45055e 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -475,28 +475,21 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -507,7 +500,6 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1043,10 +1035,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -2664,28 +2656,21 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -2696,7 +2681,6 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -3232,10 +3216,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index ba9dd8f7c246..6512bee36e88 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -513,28 +513,21 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -545,7 +538,6 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -1081,10 +1073,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
@@ -1897,28 +1889,21 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX9-O0-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    s_nop 0
+; GFX9-O0-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; implicit-def: $sgpr8
 ; GFX9-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v8, v6
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v8
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v7
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    v_mov_b32_e32 v10, v5
-; GFX9-O0-NEXT:    v_mov_b32_e32 v9, v4
-; GFX9-O0-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
 ; GFX9-O0-NEXT:    s_nop 0
-; GFX9-O0-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
-; GFX9-O0-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9]
+; GFX9-O0-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-O0-NEXT:    v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    s_mov_b64 s[12:13], 0x7f
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[12:13]
-; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[14:15]
-; GFX9-O0-NEXT:    s_mov_b64 s[14:15], s[6:7]
-; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15]
+; GFX9-O0-NEXT:    v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[14:15]
 ; GFX9-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[8:9]
 ; GFX9-O0-NEXT:    v_and_b32_e64 v6, 1, v6
@@ -1929,7 +1914,6 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v6, v5
 ; GFX9-O0-NEXT:    s_mov_b32 s14, s13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v6, v6, s14
-; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
 ; GFX9-O0-NEXT:    ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13
 ; GFX9-O0-NEXT:    v_xor_b32_e64 v4, v4, s12
 ; GFX9-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
@@ -2465,10 +2449,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
-; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-O0-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
 ; GFX9-O0-NEXT:    s_mov_b64 s[6:7], 1
 ; GFX9-O0-NEXT:    s_mov_b32 s5, s6
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AVR/cmp.ll b/llvm/test/CodeGen/AVR/cmp.ll
index efc9b8da45ba..c932bda1807f 100644
--- a/llvm/test/CodeGen/AVR/cmp.ll
+++ b/llvm/test/CodeGen/AVR/cmp.ll
@@ -298,3 +298,18 @@ define i16 @cmp_i16_gt_1023(i16 %0) {
   %3 = zext i1 %2 to i16
   ret i16 %3
 }
+
+define void @cmp_issue152097(i16 %a) addrspace(1) {
+; See: https://github.com/llvm/llvm-project/issues/152097
+; CHECK-LABEL: cmp_issue152097
+; CHECK:      ldi r18, -1
+; CHECK-NEXT: cpi r24, -2
+; CHECK-NEXT: cpc r25, r18
+; CHECK-NEXT: ret
+  %cmp = icmp ugt i16 -2, %a
+  br i1 %cmp, label %if.then, label %if.else
+if.then:
+  ret void
+if.else:
+  ret void
+}
diff --git a/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll b/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
index b23366bc11ac..f5430dfea586 100644
--- a/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
+++ b/llvm/test/CodeGen/Hexagon/hexagon-strcpy.ll
@@ -1,20 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -march=hexagon -verify-machineinstrs  < %s | FileCheck %s
+; RUN: llc -mtriple=hexagon -verify-machineinstrs  < %s | FileCheck %s
 
 @.str = private unnamed_addr constant [31 x i8] c"DHRYSTONE PROGRAM, 3'RD STRING\00", align 1
 @.str1 = private unnamed_addr constant [3 x i8] c"%s\00", align 1
 
-; Function Attrs: nounwind
 declare i32 @printf(i8* nocapture readonly, ...)
 
 ; Function Attrs: nounwind
-define i32 @main() {
+define i32 @main() nounwind {
 ; CHECK-LABEL: main:
-; CHECK:         .cfi_startproc
-; CHECK-NEXT:  // %bb.0: // %entry
-; CHECK-NEXT:    .cfi_def_cfa r30, 8
-; CHECK-NEXT:    .cfi_offset r31, -4
-; CHECK-NEXT:    .cfi_offset r30, -8
+; CHECK:       // %bb.0: // %entry
 ; CHECK-NEXT:    {
 ; CHECK-NEXT:     r0 = ##.L.str1
 ; CHECK-NEXT:     r3:2 = CONST64(#2325073635944967245)
@@ -53,5 +48,4 @@ entry:
   ret i32 0
 }
 
-; Function Attrs: nounwind
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
diff --git a/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll
new file mode 100644
index 000000000000..ff50f1abe589
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/mpy-operand-hoist.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK-NOT: r{{[0-9]+}} = asr(r{{[0-9]+}},#{{[0-9]+}})
+; CHECK-NOT: r{{[0-9]+}}:{{[0-9]+}} = mpyu(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NOT: r{{[0-9]+}} += mpyi(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK: r{{[0-9]+}}:{{[0-9]+}} = mpy(r{{[0-9]+}},r{{[0-9]+}})
+
+; ModuleID = '39544.c'
+source_filename = "39544.c"
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local void @mul_n(i64* nocapture %p, i32* nocapture readonly %a, i32 %k, i32 %n) local_unnamed_addr {
+entry:
+  %cmp7 = icmp sgt i32 %n, 0
+  br i1 %cmp7, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv1 = sext i32 %k to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %arrayidx.phi = phi i32* [ %a, %for.body.lr.ph ], [ %arrayidx.inc, %for.body ]
+  %arrayidx2.phi = phi i64* [ %p, %for.body.lr.ph ], [ %arrayidx2.inc, %for.body ]
+  %i.08 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %conv = sext i32 %0 to i64
+  %mul = mul nsw i64 %conv, %conv1
+  store i64 %mul, i64* %arrayidx2.phi, align 8
+  %inc = add nuw nsw i32 %i.08, 1
+  %exitcond = icmp eq i32 %inc, %n
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  %arrayidx2.inc = getelementptr i64, i64* %arrayidx2.phi, i32 1
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
new file mode 100644
index 000000000000..2960343564fc
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-load-to-store-forward.mir
@@ -0,0 +1,50 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner %s -o /dev/null
+
+# Check that edges that violate topological order are not added to the
+# SwingSchedulerDAG. This is a case where the crash was caused by PR 145878.
+
+--- |
+  target triple = "hexagon"
+  
+  define void @crash_145878() {
+  entry:
+    br label %loop
+  
+  loop:                                             ; preds = %loop, %entry
+    %lsr.iv2 = phi i32 [ %lsr.iv.next, %loop ], [ 1, %entry ]
+    %lsr.iv = phi ptr [ %cgep3, %loop ], [ inttoptr (i32 -8 to ptr), %entry ]
+    %cgep = getelementptr i8, ptr %lsr.iv, i32 12
+    %load = load i32, ptr %cgep, align 4
+    store i32 %load, ptr %lsr.iv, align 4
+    %lsr.iv.next = add nsw i32 %lsr.iv2, -1
+    %iv.cmp.not = icmp eq i32 %lsr.iv.next, 0
+    %cgep3 = getelementptr i8, ptr %lsr.iv, i32 -8
+    br i1 %iv.cmp.not, label %exit, label %loop
+  
+  exit:                                             ; preds = %loop
+    ret void
+  }
+...
+---
+name:            crash_145878
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+  
+    %5:intregs = A2_tfrsi -8
+    J2_loop0i %bb.1, 1, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+  
+  bb.1.loop (machine-block-address-taken):
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  
+    %1:intregs = PHI %5, %bb.0, %3, %bb.1
+    %6:intregs = L2_loadri_io %1, 12 :: (load (s32) from %ir.cgep)
+    S2_storeri_io %1, 0, killed %6 :: (store (s32) into %ir.lsr.iv)
+    %3:intregs = A2_addi %1, -8
+    ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+  
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
index d07e2914c753..f7653af1fa9b 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-common.ll
@@ -122,23 +122,23 @@ define i64 @callee_large_scalars(i256 %a, i256 %b) nounwind {
 define i64 @caller_large_scalars() nounwind {
 ; CHECK-LABEL: caller_large_scalars:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $zero, $sp, 24
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $zero, $sp, 40
 ; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 8
+; CHECK-NEXT:    vst $vr0, $sp, 24
 ; CHECK-NEXT:    ori $a0, $zero, 2
-; CHECK-NEXT:    st.d $a0, $sp, 0
-; CHECK-NEXT:    st.d $zero, $sp, 56
-; CHECK-NEXT:    vst $vr0, $sp, 40
+; CHECK-NEXT:    st.d $a0, $sp, 16
+; CHECK-NEXT:    st.d $zero, $sp, 72
+; CHECK-NEXT:    vst $vr0, $sp, 56
 ; CHECK-NEXT:    ori $a2, $zero, 1
-; CHECK-NEXT:    addi.d $a0, $sp, 32
-; CHECK-NEXT:    addi.d $a1, $sp, 0
-; CHECK-NEXT:    st.d $a2, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 48
+; CHECK-NEXT:    addi.d $a1, $sp, 16
+; CHECK-NEXT:    st.d $a2, $sp, 48
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_large_scalars)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %1 = call i64 @callee_large_scalars(i256 1, i256 2)
   ret i64 %1
@@ -177,20 +177,20 @@ define i64 @callee_large_scalars_exhausted_regs(i64 %a, i64 %b, i64 %c, i64 %d,
 define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; CHECK-LABEL: caller_large_scalars_exhausted_regs:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $a0, $sp, 16
+; CHECK-NEXT:    addi.d $sp, $sp, -112
+; CHECK-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    st.d $a0, $sp, 8
 ; CHECK-NEXT:    ori $a0, $zero, 9
 ; CHECK-NEXT:    st.d $a0, $sp, 0
-; CHECK-NEXT:    st.d $zero, $sp, 40
+; CHECK-NEXT:    st.d $zero, $sp, 56
 ; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 24
+; CHECK-NEXT:    vst $vr0, $sp, 40
 ; CHECK-NEXT:    ori $a0, $zero, 10
-; CHECK-NEXT:    st.d $a0, $sp, 16
-; CHECK-NEXT:    st.d $zero, $sp, 72
+; CHECK-NEXT:    st.d $a0, $sp, 32
+; CHECK-NEXT:    st.d $zero, $sp, 88
 ; CHECK-NEXT:    ori $a0, $zero, 8
-; CHECK-NEXT:    st.d $a0, $sp, 48
+; CHECK-NEXT:    st.d $a0, $sp, 64
 ; CHECK-NEXT:    ori $a0, $zero, 1
 ; CHECK-NEXT:    ori $a1, $zero, 2
 ; CHECK-NEXT:    ori $a2, $zero, 3
@@ -198,12 +198,12 @@ define i64 @caller_large_scalars_exhausted_regs() nounwind {
 ; CHECK-NEXT:    ori $a4, $zero, 5
 ; CHECK-NEXT:    ori $a5, $zero, 6
 ; CHECK-NEXT:    ori $a6, $zero, 7
-; CHECK-NEXT:    addi.d $a7, $sp, 48
-; CHECK-NEXT:    vst $vr0, $sp, 56
+; CHECK-NEXT:    addi.d $a7, $sp, 64
+; CHECK-NEXT:    vst $vr0, $sp, 72
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(callee_large_scalars_exhausted_regs)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 112
 ; CHECK-NEXT:    ret
   %1 = call i64 @callee_large_scalars_exhausted_regs(
       i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i256 8, i64 9,
diff --git a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
index c88b67f13d1e..da8c3e93f684 100644
--- a/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
+++ b/llvm/test/CodeGen/LoongArch/calling-conv-half.ll
@@ -1252,8 +1252,8 @@ define i32 @caller_half_on_stack() nounwind {
 ;
 ; LA64F-LP64S-LABEL: caller_half_on_stack:
 ; LA64F-LP64S:       # %bb.0:
-; LA64F-LP64S-NEXT:    addi.d $sp, $sp, -80
-; LA64F-LP64S-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64F-LP64S-NEXT:    addi.d $sp, $sp, -96
+; LA64F-LP64S-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
 ; LA64F-LP64S-NEXT:    lu12i.w $a0, -12
 ; LA64F-LP64S-NEXT:    ori $a1, $a0, 3200
 ; LA64F-LP64S-NEXT:    lu32i.d $a1, 0
@@ -1292,8 +1292,8 @@ define i32 @caller_half_on_stack() nounwind {
 ; LA64F-LP64S-NEXT:    st.w $t0, $sp, 0
 ; LA64F-LP64S-NEXT:    pcaddu18i $ra, %call36(callee_half_on_stack)
 ; LA64F-LP64S-NEXT:    jirl $ra, $ra, 0
-; LA64F-LP64S-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64F-LP64S-NEXT:    addi.d $sp, $sp, 80
+; LA64F-LP64S-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64F-LP64S-NEXT:    addi.d $sp, $sp, 96
 ; LA64F-LP64S-NEXT:    ret
 ;
 ; LA64F-LP64D-LABEL: caller_half_on_stack:
@@ -1336,8 +1336,8 @@ define i32 @caller_half_on_stack() nounwind {
 ;
 ; LA64D-LP64S-LABEL: caller_half_on_stack:
 ; LA64D-LP64S:       # %bb.0:
-; LA64D-LP64S-NEXT:    addi.d $sp, $sp, -80
-; LA64D-LP64S-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64D-LP64S-NEXT:    addi.d $sp, $sp, -96
+; LA64D-LP64S-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
 ; LA64D-LP64S-NEXT:    lu12i.w $a0, -12
 ; LA64D-LP64S-NEXT:    ori $a1, $a0, 3200
 ; LA64D-LP64S-NEXT:    lu32i.d $a1, 0
@@ -1376,8 +1376,8 @@ define i32 @caller_half_on_stack() nounwind {
 ; LA64D-LP64S-NEXT:    st.w $t0, $sp, 0
 ; LA64D-LP64S-NEXT:    pcaddu18i $ra, %call36(callee_half_on_stack)
 ; LA64D-LP64S-NEXT:    jirl $ra, $ra, 0
-; LA64D-LP64S-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64D-LP64S-NEXT:    addi.d $sp, $sp, 80
+; LA64D-LP64S-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64D-LP64S-NEXT:    addi.d $sp, $sp, 96
 ; LA64D-LP64S-NEXT:    ret
 ;
 ; LA64D-LP64D-LABEL: caller_half_on_stack:
diff --git a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
index 52d8dd05aaa4..1a9de3b0ef3d 100644
--- a/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
+++ b/llvm/test/CodeGen/LoongArch/can-not-realign-stack.ll
@@ -14,41 +14,41 @@
 define dso_local noundef signext i32 @main() nounwind {
 ; CHECK-LABEL: main:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -272
-; CHECK-NEXT:    st.d $ra, $sp, 264 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -288
+; CHECK-NEXT:    st.d $ra, $sp, 280 # 8-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_0)
 ; CHECK-NEXT:    xvld $xr0, $a0, %pc_lo12(.LCPI0_0)
-; CHECK-NEXT:    xvst $xr0, $sp, 96 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 112 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_1)
 ; CHECK-NEXT:    xvld $xr1, $a0, %pc_lo12(.LCPI0_1)
-; CHECK-NEXT:    xvst $xr1, $sp, 64 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr1, $sp, 80 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_2)
 ; CHECK-NEXT:    xvld $xr2, $a0, %pc_lo12(.LCPI0_2)
-; CHECK-NEXT:    xvst $xr2, $sp, 32 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr2, $sp, 48 # 32-byte Folded Spill
 ; CHECK-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI0_3)
 ; CHECK-NEXT:    xvld $xr3, $a0, %pc_lo12(.LCPI0_3)
-; CHECK-NEXT:    xvst $xr3, $sp, 0 # 32-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 136
-; CHECK-NEXT:    xvst $xr1, $sp, 168
-; CHECK-NEXT:    xvst $xr2, $sp, 200
-; CHECK-NEXT:    xvst $xr3, $sp, 232
-; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    xvst $xr3, $sp, 16 # 32-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 152
+; CHECK-NEXT:    xvst $xr1, $sp, 184
+; CHECK-NEXT:    xvst $xr2, $sp, 216
+; CHECK-NEXT:    xvst $xr3, $sp, 248
+; CHECK-NEXT:    addi.d $a0, $sp, 152
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(foo)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 96 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 136
-; CHECK-NEXT:    xvld $xr0, $sp, 64 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 168
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 200
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
-; CHECK-NEXT:    xvst $xr0, $sp, 232
-; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    xvld $xr0, $sp, 112 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 152
+; CHECK-NEXT:    xvld $xr0, $sp, 80 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 184
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 216
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 248
+; CHECK-NEXT:    addi.d $a0, $sp, 152
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(bar)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    move $a0, $zero
-; CHECK-NEXT:    ld.d $ra, $sp, 264 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 272
+; CHECK-NEXT:    ld.d $ra, $sp, 280 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 288
 ; CHECK-NEXT:    ret
 entry:
   %s = alloca %struct.S, align 2
diff --git a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
index ccc5c703e71e..15ac95dfc6c5 100644
--- a/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
+++ b/llvm/test/CodeGen/LoongArch/emergency-spill-slot.ll
@@ -28,12 +28,12 @@ define void @func() {
 ; CHECK-NEXT:    ld.w $a3, $a1, 0
 ; CHECK-NEXT:    ld.w $a2, $a1, 0
 ; CHECK-NEXT:    ld.w $a0, $a1, 0
-; CHECK-NEXT:    st.d $fp, $sp, 0
+; CHECK-NEXT:    st.d $fp, $sp, 0 # 8-byte Folded Spill
 ; CHECK-NEXT:    lu12i.w $fp, 1
 ; CHECK-NEXT:    ori $fp, $fp, 12
 ; CHECK-NEXT:    add.d $fp, $sp, $fp
 ; CHECK-NEXT:    st.w $t8, $fp, 0
-; CHECK-NEXT:    ld.d $fp, $sp, 0
+; CHECK-NEXT:    ld.d $fp, $sp, 0 # 8-byte Folded Reload
 ; CHECK-NEXT:    st.w $t8, $a1, 0
 ; CHECK-NEXT:    st.w $t7, $a1, 0
 ; CHECK-NEXT:    st.w $t6, $a1, 0
diff --git a/llvm/test/CodeGen/LoongArch/frame.ll b/llvm/test/CodeGen/LoongArch/frame.ll
index 048703029d8c..b29d8634854f 100644
--- a/llvm/test/CodeGen/LoongArch/frame.ll
+++ b/llvm/test/CodeGen/LoongArch/frame.ll
@@ -1,5 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc --mtriple=loongarch64 -mattr=+d < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d,-lsx < %s | FileCheck %s --check-prefixes=CHECK,NOLSX
+; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx < %s | FileCheck %s --check-prefixes=CHECK,LSX
 
 %struct.key_t = type { i32, [16 x i8] }
 
@@ -7,20 +8,35 @@ declare void @llvm.memset.p0.i64(ptr, i8, i64, i1)
 declare void @test1(ptr)
 
 define i32 @test() nounwind {
-; CHECK-LABEL: test:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -32
-; CHECK-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; CHECK-NEXT:    st.w $zero, $sp, 16
-; CHECK-NEXT:    vrepli.b $vr0, 0
-; CHECK-NEXT:    vst $vr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 4
-; CHECK-NEXT:    pcaddu18i $ra, %call36(test1)
-; CHECK-NEXT:    jirl $ra, $ra, 0
-; CHECK-NEXT:    move $a0, $zero
-; CHECK-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 32
-; CHECK-NEXT:    ret
+; NOLSX-LABEL: test:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -32
+; NOLSX-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; NOLSX-NEXT:    st.w $zero, $sp, 16
+; NOLSX-NEXT:    st.d $zero, $sp, 8
+; NOLSX-NEXT:    st.d $zero, $sp, 0
+; NOLSX-NEXT:    addi.d $a0, $sp, 4
+; NOLSX-NEXT:    pcaddu18i $ra, %call36(test1)
+; NOLSX-NEXT:    jirl $ra, $ra, 0
+; NOLSX-NEXT:    move $a0, $zero
+; NOLSX-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; NOLSX-NEXT:    addi.d $sp, $sp, 32
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -32
+; LSX-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
+; LSX-NEXT:    st.w $zero, $sp, 16
+; LSX-NEXT:    vrepli.b $vr0, 0
+; LSX-NEXT:    vst $vr0, $sp, 0
+; LSX-NEXT:    addi.d $a0, $sp, 4
+; LSX-NEXT:    pcaddu18i $ra, %call36(test1)
+; LSX-NEXT:    jirl $ra, $ra, 0
+; LSX-NEXT:    move $a0, $zero
+; LSX-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
+; LSX-NEXT:    addi.d $sp, $sp, 32
+; LSX-NEXT:    ret
   %key = alloca %struct.key_t, align 4
   call void @llvm.memset.p0.i64(ptr %key, i8 0, i64 20, i1 false)
   %1 = getelementptr inbounds %struct.key_t, ptr %key, i64 0, i32 1, i64 0
@@ -98,3 +114,62 @@ define void @test_large_frame_size_1234576() "frame-pointer"="all" {
   %1 = alloca i8, i32 1234567
   ret void
 }
+
+;; Note: will create an emergency spill slot, if (!isInt<7>(StackSize)).
+;; Should involve only one SP-adjusting addi per adjustment.
+;; LSX 112 + 16(emergency solt) = 128
+define void @test_frame_size_112() {
+; NOLSX-LABEL: test_frame_size_112:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -112
+; NOLSX-NEXT:    .cfi_def_cfa_offset 112
+; NOLSX-NEXT:    addi.d $sp, $sp, 112
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_112:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -128
+; LSX-NEXT:    .cfi_def_cfa_offset 128
+; LSX-NEXT:    addi.d $sp, $sp, 128
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 112
+  ret void
+}
+
+;; LSX 128 + 16(emergency solt) = 144
+define void @test_frame_size_128() {
+; NOLSX-LABEL: test_frame_size_128:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -128
+; NOLSX-NEXT:    .cfi_def_cfa_offset 128
+; NOLSX-NEXT:    addi.d $sp, $sp, 128
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_128:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -144
+; LSX-NEXT:    .cfi_def_cfa_offset 144
+; LSX-NEXT:    addi.d $sp, $sp, 144
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 128
+  ret void
+}
+
+;; LSX 144 + 16(emergency solt) = 160
+define void @test_frame_size_144() {
+; NOLSX-LABEL: test_frame_size_144:
+; NOLSX:       # %bb.0:
+; NOLSX-NEXT:    addi.d $sp, $sp, -144
+; NOLSX-NEXT:    .cfi_def_cfa_offset 144
+; NOLSX-NEXT:    addi.d $sp, $sp, 144
+; NOLSX-NEXT:    ret
+;
+; LSX-LABEL: test_frame_size_144:
+; LSX:       # %bb.0:
+; LSX-NEXT:    addi.d $sp, $sp, -160
+; LSX-NEXT:    .cfi_def_cfa_offset 160
+; LSX-NEXT:    addi.d $sp, $sp, 160
+; LSX-NEXT:    ret
+  %1 = alloca i8, i32 144
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
index 402ddb9ad941..5a55b253c77b 100644
--- a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
@@ -6,11 +6,11 @@
 define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) {
 ; CHECK-LABEL: box:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -96
-; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    addi.d $sp, $sp, -112
+; CHECK-NEXT:    .cfi_def_cfa_offset 112
 ; CHECK-NEXT:    slli.d $a2, $a1, 5
 ; CHECK-NEXT:    alsl.d $a1, $a1, $a2, 4
-; CHECK-NEXT:    addi.d $a2, $sp, 0
+; CHECK-NEXT:    addi.d $a2, $sp, 16
 ; CHECK-NEXT:    add.d $a3, $a2, $a1
 ; CHECK-NEXT:    vldx $vr0, $a1, $a2
 ; CHECK-NEXT:    vld $vr1, $a3, 32
@@ -18,7 +18,7 @@ define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 der
 ; CHECK-NEXT:    vst $vr0, $a0, 0
 ; CHECK-NEXT:    vst $vr1, $a0, 32
 ; CHECK-NEXT:    vst $vr2, $a0, 16
-; CHECK-NEXT:    addi.d $sp, $sp, 96
+; CHECK-NEXT:    addi.d $sp, $sp, 112
 ; CHECK-NEXT:    ret
   %1 = alloca [2 x %Box], align 16
   %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i
diff --git a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
index 976924bdca68..6035b8822cef 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/broadcast-load.ll
@@ -18,6 +18,32 @@ define <4 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst) {
   ret <4 x i64> %tmp2
 }
 
+define <16 x i16> @should_not_be_optimized_sext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_sext_load:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = sext i8 %tmp to i16
+  %tmp2 = insertelement <16 x i16> zeroinitializer, i16 %tmp1, i32 0
+  %tmp3 = shufflevector <16 x i16> %tmp2, <16 x i16> poison, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp3
+}
+
+define <16 x i16> @should_not_be_optimized_zext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_zext_load:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.bu $a0, $a0, 0
+; CHECK-NEXT:    xvreplgr2vr.h $xr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = zext i8 %tmp to i16
+  %tmp2 = insertelement <16 x i16> zeroinitializer, i16 %tmp1, i32 0
+  %tmp3 = shufflevector <16 x i16> %tmp2, <16 x i16> poison, <16 x i32> zeroinitializer
+  ret <16 x i16> %tmp3
+}
+
 define <4 x i64> @xvldrepl_d_unaligned_offset(ptr %ptr) {
 ; CHECK-LABEL: xvldrepl_d_unaligned_offset:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
index 789b51d9b5e5..9528280d181a 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/fpowi.ll
@@ -6,10 +6,10 @@ declare <8 x float> @llvm.powi.v8f32.i32(<8 x float>, i32)
 define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
 ; CHECK-LABEL: powi_v8f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; CHECK-NEXT:    addi.w $fp, $a0, 0
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 0
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
@@ -18,79 +18,79 @@ define <8 x float> @powi_v8f32(<8 x float> %va, i32 %b) nounwind {
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 1
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 2
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 2
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 3
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 3
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 4
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 4
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 5
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 5
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 6
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 6
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 7
 ; CHECK-NEXT:    movgr2fr.w $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powisf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.s $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 7
-; CHECK-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %res = call <8 x float> @llvm.powi.v8f32.i32(<8 x float> %va, i32 %b)
@@ -102,10 +102,10 @@ declare <4 x double> @llvm.powi.v4f64.i32(<4 x double>, i32)
 define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
 ; CHECK-LABEL: powi_v4f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -80
-; CHECK-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; CHECK-NEXT:    xvst $xr0, $sp, 0 # 32-byte Folded Spill
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    xvst $xr0, $sp, 16 # 32-byte Folded Spill
 ; CHECK-NEXT:    addi.w $fp, $a0, 0
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
@@ -114,39 +114,39 @@ define <4 x double> @powi_v4f64(<4 x double> %va, i32 %b) nounwind {
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 1
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 1
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 2
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 2
-; CHECK-NEXT:    xvst $xr0, $sp, 32 # 32-byte Folded Spill
-; CHECK-NEXT:    xvld $xr0, $sp, 0 # 32-byte Folded Reload
+; CHECK-NEXT:    xvst $xr0, $sp, 48 # 32-byte Folded Spill
+; CHECK-NEXT:    xvld $xr0, $sp, 16 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
 ; CHECK-NEXT:    movgr2fr.d $fa0, $a0
 ; CHECK-NEXT:    move $a0, $fp
 ; CHECK-NEXT:    pcaddu18i $ra, %call36(__powidf2)
 ; CHECK-NEXT:    jirl $ra, $ra, 0
 ; CHECK-NEXT:    movfr2gr.d $a0, $fa0
-; CHECK-NEXT:    xvld $xr0, $sp, 32 # 32-byte Folded Reload
+; CHECK-NEXT:    xvld $xr0, $sp, 48 # 32-byte Folded Reload
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 3
-; CHECK-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 80
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %res = call <4 x double> @llvm.powi.v4f64.i32(<4 x double> %va, i32 %b)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
index 04214f5dfa9d..2e1618748688 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/extractelement.ll
@@ -76,21 +76,21 @@ define void @extract_4xdouble(ptr %src, ptr %dst) nounwind {
 define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_32xi8_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 0
 ; CHECK-NEXT:    ld.b $a0, $a0, 0
 ; CHECK-NEXT:    st.b $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <32 x i8>, ptr %src
   %e = extractelement <32 x i8> %v, i32 %idx
@@ -101,21 +101,21 @@ define void @extract_32xi8_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_16xi16_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 1
 ; CHECK-NEXT:    ld.h $a0, $a0, 0
 ; CHECK-NEXT:    st.h $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <16 x i16>, ptr %src
   %e = extractelement <16 x i16> %v, i32 %idx
@@ -126,21 +126,21 @@ define void @extract_16xi16_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    ld.w $a0, $a0, 0
 ; CHECK-NEXT:    st.w $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %e = extractelement <8 x i32> %v, i32 %idx
@@ -151,21 +151,21 @@ define void @extract_8xi32_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_4xi64_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    ld.d $a0, $a0, 0
 ; CHECK-NEXT:    st.d $a0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i64>, ptr %src
   %e = extractelement <4 x i64> %v, i32 %idx
@@ -176,21 +176,21 @@ define void @extract_4xi64_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    fld.s $fa0, $a0, 0
 ; CHECK-NEXT:    fst.s $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %e = extractelement <8 x float> %v, i32 %idx
@@ -201,21 +201,21 @@ define void @extract_8xfloat_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 define void @extract_4xdouble_idx(ptr %src, ptr %dst, i32 %idx) nounwind {
 ; CHECK-LABEL: extract_4xdouble_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    fld.d $fa0, $a0, 0
 ; CHECK-NEXT:    fst.d $fa0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x double>, ptr %src
   %e = extractelement <4 x double> %v, i32 %idx
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index 3fdc439e6867..c3d09953fbc4 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -4,18 +4,18 @@
 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.b $a0, $sp, 31
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.b $a0, $sp, 63
 ; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 1
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <32 x i8> %a, i32 31
@@ -26,18 +26,18 @@ entry:
 define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.h $a0, $sp, 30
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.h $a0, $sp, 62
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 1
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b = extractelement <16 x i16> %a, i32 15
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
index 88c3e4367ffa..f2d8dda1850b 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-pair-elements.ll
@@ -4,23 +4,23 @@
 define <32 x i8> @insert_extract_v32i8(<32 x i8> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v32i8:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    vpickve2gr.b $a0, $vr0, 15
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.b $a1, $sp, 31
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.b $a1, $sp, 63
 ; CHECK-NEXT:    vinsgr2vr.b $vr0, $a0, 1
 ; CHECK-NEXT:    xvori.b $xr1, $xr0, 0
 ; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 1
 ; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b_lo = extractelement <32 x i8> %a, i32 15
@@ -33,23 +33,23 @@ entry:
 define <16 x i16> @insert_extract_v16i16(<16 x i16> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v16i16:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    vpickve2gr.h $a0, $vr0, 7
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    ld.h $a1, $sp, 30
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    ld.h $a1, $sp, 62
 ; CHECK-NEXT:    vinsgr2vr.h $vr0, $a0, 1
 ; CHECK-NEXT:    xvori.b $xr1, $xr0, 0
 ; CHECK-NEXT:    xvpermi.q $xr1, $xr0, 1
 ; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
 ; CHECK-NEXT:    xvpermi.q $xr0, $xr1, 2
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
 entry:
   %b_lo = extractelement <16 x i16> %a, i32 7
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
index 25106b456d2f..a6c0b332abcb 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insertelement.ll
@@ -116,22 +116,22 @@ define void @insert_4xdouble(ptr %src, ptr %dst, double %in) nounwind {
 define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_32xi8_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 0
 ; CHECK-NEXT:    st.b $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <32 x i8>, ptr %src
   %v_new = insertelement <32 x i8> %v, i8 %in, i32 %idx
@@ -142,22 +142,22 @@ define void @insert_32xi8_idx(ptr %src, ptr %dst, i8 %in, i32 %idx) nounwind {
 define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_16xi16_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 1
 ; CHECK-NEXT:    st.h $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <16 x i16>, ptr %src
   %v_new = insertelement <16 x i16> %v, i16 %in, i32 %idx
@@ -168,22 +168,22 @@ define void @insert_16xi16_idx(ptr %src, ptr %dst, i16 %in, i32 %idx) nounwind {
 define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_8xi32_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 2
 ; CHECK-NEXT:    st.w $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x i32>, ptr %src
   %v_new = insertelement <8 x i32> %v, i32 %in, i32 %idx
@@ -194,22 +194,22 @@ define void @insert_8xi32_idx(ptr %src, ptr %dst, i32 %in, i32 %idx) nounwind {
 define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_4xi64_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr0, $a0, 0
-; CHECK-NEXT:    xvst $xr0, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr0, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a3, 4, 3
 ; CHECK-NEXT:    st.d $a2, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x i64>, ptr %src
   %v_new = insertelement <4 x i64> %v, i64 %in, i32 %idx
@@ -220,22 +220,22 @@ define void @insert_4xi64_idx(ptr %src, ptr %dst, i64 %in, i32 %idx) nounwind {
 define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_8xfloat_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
-; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr1, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 2
 ; CHECK-NEXT:    fst.s $fa0, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <8 x float>, ptr %src
   %v_new = insertelement <8 x float> %v, float %in, i32 %idx
@@ -246,22 +246,22 @@ define void @insert_8xfloat_idx(ptr %src, ptr %dst, float %in, i32 %idx) nounwin
 define void @insert_4xdouble_idx(ptr %src, ptr %dst, double %in, i32 %idx) nounwind {
 ; CHECK-LABEL: insert_4xdouble_idx:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi.d $sp, $sp, -64
-; CHECK-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; CHECK-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; CHECK-NEXT:    addi.d $fp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $sp, -96
+; CHECK-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $fp, $sp, 96
 ; CHECK-NEXT:    bstrins.d $sp, $zero, 4, 0
 ; CHECK-NEXT:    xvld $xr1, $a0, 0
-; CHECK-NEXT:    xvst $xr1, $sp, 0
-; CHECK-NEXT:    addi.d $a0, $sp, 0
+; CHECK-NEXT:    xvst $xr1, $sp, 32
+; CHECK-NEXT:    addi.d $a0, $sp, 32
 ; CHECK-NEXT:    bstrins.d $a0, $a2, 4, 3
 ; CHECK-NEXT:    fst.d $fa0, $a0, 0
-; CHECK-NEXT:    xvld $xr0, $sp, 0
+; CHECK-NEXT:    xvld $xr0, $sp, 32
 ; CHECK-NEXT:    xvst $xr0, $a1, 0
-; CHECK-NEXT:    addi.d $sp, $fp, -64
-; CHECK-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; CHECK-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; CHECK-NEXT:    addi.d $sp, $sp, 64
+; CHECK-NEXT:    addi.d $sp, $fp, -96
+; CHECK-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 96
 ; CHECK-NEXT:    ret
   %v = load volatile <4 x double>, ptr %src
   %v_new = insertelement <4 x double> %v, double %in, i32 %idx
diff --git a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
index ffedd7f9e943..648c19d50971 100644
--- a/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
+++ b/llvm/test/CodeGen/LoongArch/llvm.sincos.ll
@@ -347,42 +347,42 @@ define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    vpackev.w $vr0, $vr0, $vr1
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    # kill: def $f0 killed $f0 def $vr0
-; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vpackev.w $vr1, $vr0, $vr1
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
   %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a)
   ret { <2 x float>, <2 x float> } %result
@@ -439,48 +439,48 @@ define { <3 x float>, <3 x float> } @test_sincos_v3f32(<3 x float> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v3f32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -96
-; LA64-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -112
+; LA64-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 2
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 72
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 88
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 68
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 84
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.w $vr0, $vr0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sinf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 64
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 80
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 56
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 72
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 52
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    fst.s $fa0, $sp, 68
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0 killed $f0 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cosf)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    fst.s $fa0, $sp, 48
-; LA64-NEXT:    vld $vr0, $sp, 64
-; LA64-NEXT:    vld $vr1, $sp, 48
-; LA64-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 96
+; LA64-NEXT:    fst.s $fa0, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 80
+; LA64-NEXT:    vld $vr1, $sp, 64
+; LA64-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 112
 ; LA64-NEXT:    ret
   %result = call { <3 x float>, <3 x float> } @llvm.sincos.v3f32(<3 x float> %a)
   ret { <3 x float>, <3 x float> } %result
@@ -568,44 +568,44 @@ define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f64:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
-; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
 ; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
+; LA64-NEXT:    vreplvei.d $vr0, $vr0, 0
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sin)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    vreplvei.d $vr0, $vr0, 1
-; LA64-NEXT:    vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(sin)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 1
-; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 48 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cos)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
 ; LA64-NEXT:    vinsgr2vr.d $vr0, $a0, 0
-; LA64-NEXT:    vst $vr0, $sp, 16 # 16-byte Folded Spill
-; LA64-NEXT:    vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT:    vst $vr0, $sp, 32 # 16-byte Folded Spill
+; LA64-NEXT:    vld $vr0, $sp, 16 # 16-byte Folded Reload
 ; LA64-NEXT:    # kill: def $f0_64 killed $f0_64 killed $vr0
 ; LA64-NEXT:    pcaddu18i $ra, %call36(cos)
 ; LA64-NEXT:    jirl $ra, $ra, 0
 ; LA64-NEXT:    movfr2gr.d $a0, $fa0
-; LA64-NEXT:    vld $vr1, $sp, 16 # 16-byte Folded Reload
+; LA64-NEXT:    vld $vr1, $sp, 32 # 16-byte Folded Reload
 ; LA64-NEXT:    vinsgr2vr.d $vr1, $a0, 1
-; LA64-NEXT:    vld $vr0, $sp, 32 # 16-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    vld $vr0, $sp, 48 # 16-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
   %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)
   ret { <2 x double>, <2 x double> } %result
@@ -801,17 +801,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
 ;
 ; LA64-LABEL: test_sincos_v2f128:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -80
-; LA64-NEXT:    st.d $ra, $sp, 72 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 64 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s0, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s1, $sp, 48 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s2, $sp, 40 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s4, $sp, 24 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s5, $sp, 16 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s6, $sp, 8 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s7, $sp, 0 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -96
+; LA64-NEXT:    st.d $ra, $sp, 88 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 80 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s0, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s1, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s2, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s3, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s4, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s5, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s6, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s7, $sp, 16 # 8-byte Folded Spill
 ; LA64-NEXT:    ld.d $fp, $a1, 16
 ; LA64-NEXT:    ld.d $s0, $a1, 24
 ; LA64-NEXT:    ld.d $s1, $a1, 0
@@ -847,17 +847,17 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
 ; LA64-NEXT:    st.d $s6, $s3, 16
 ; LA64-NEXT:    st.d $s5, $s3, 8
 ; LA64-NEXT:    st.d $s4, $s3, 0
-; LA64-NEXT:    ld.d $s7, $sp, 0 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s6, $sp, 8 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s5, $sp, 16 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s4, $sp, 24 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s3, $sp, 32 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s2, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s1, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $s0, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $fp, $sp, 64 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 72 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 80
+; LA64-NEXT:    ld.d $s7, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s6, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s5, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s4, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s3, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s2, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s1, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $s0, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 80 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 88 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 96
 ; LA64-NEXT:    ret
   %result = call { <2 x fp128>, <2 x fp128> } @llvm.sincos.v2f128(<2 x fp128> %a)
   ret { <2 x fp128>, <2 x fp128> } %result
diff --git a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
index c46747ef3050..f058acf5e45e 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/broadcast-load.ll
@@ -1,8 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc --mtriple=loongarch64 -mattr=+lsx < %s | FileCheck %s
 
-; TODO: Load a element and splat it to a vector could be lowerd to vldrepl
-
 ; A load has more than one user shouldn't be lowered to vldrepl
 define <2 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst){
 ; CHECK-LABEL: should_not_be_optimized:
@@ -18,6 +16,32 @@ define <2 x i64> @should_not_be_optimized(ptr %ptr, ptr %dst){
   ret <2 x i64> %tmp2
 }
 
+define <8 x i16> @should_not_be_optimized_sext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_sext_load:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.b $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = sext i8 %tmp to i16
+  %tmp2 = insertelement <8 x i16> zeroinitializer, i16 %tmp1, i32 0
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp3
+}
+
+define <8 x i16> @should_not_be_optimized_zext_load(ptr %ptr) {
+; CHECK-LABEL: should_not_be_optimized_zext_load:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    ld.bu $a0, $a0, 0
+; CHECK-NEXT:    vreplgr2vr.h $vr0, $a0
+; CHECK-NEXT:    ret
+  %tmp = load i8, ptr %ptr
+  %tmp1 = zext i8 %tmp to i16
+  %tmp2 = insertelement <8 x i16> zeroinitializer, i16 %tmp1, i32 0
+  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %tmp3
+}
+
 define <2 x i64> @vldrepl_d_unaligned_offset(ptr %ptr) {
 ; CHECK-LABEL: vldrepl_d_unaligned_offset:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll
new file mode 100644
index 000000000000..96159e5884d3
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/pr146455.ll
@@ -0,0 +1,287 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --mtriple=loongarch64 -mattr=+d,+lsx --verify-machineinstrs < %s | FileCheck %s
+define void @eliminate_frame_index(<16 x i8> %a) nounwind {
+; CHECK-LABEL: eliminate_frame_index:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi.d $sp, $sp, -240
+; CHECK-NEXT:    st.d $ra, $sp, 232 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $fp, $sp, 224 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s0, $sp, 216 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s1, $sp, 208 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s2, $sp, 200 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s3, $sp, 192 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s4, $sp, 184 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s5, $sp, 176 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s6, $sp, 168 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s7, $sp, 160 # 8-byte Folded Spill
+; CHECK-NEXT:    st.d $s8, $sp, 152 # 8-byte Folded Spill
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $zero, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $ra, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $tp, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $a7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $t8, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $fp, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s0, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s1, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s2, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s3, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s4, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s5, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s6, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s7, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    addi.d $s8, $zero, 1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    st.d $a0, $sp, 0 # 8-byte Folded Spill
+; CHECK-NEXT:    addi.d $a0, $sp, 136
+; CHECK-NEXT:    vstelm.b $vr0, $a0, 0, 0
+; CHECK-NEXT:    ld.d $a0, $sp, 0 # 8-byte Folded Reload
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $zero
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $ra
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $tp
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $a7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $t8
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $fp
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s1
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s2
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s3
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s5
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s6
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s7
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    # reg use $s8
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ld.d $s8, $sp, 152 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s7, $sp, 160 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s6, $sp, 168 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s5, $sp, 176 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s4, $sp, 184 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s3, $sp, 192 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s2, $sp, 200 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s1, $sp, 208 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $s0, $sp, 216 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $fp, $sp, 224 # 8-byte Folded Reload
+; CHECK-NEXT:    ld.d $ra, $sp, 232 # 8-byte Folded Reload
+; CHECK-NEXT:    addi.d $sp, $sp, 240
+; CHECK-NEXT:    ret
+  %s = alloca [16 x i8]
+  %ss = alloca [128 x i8]
+
+  %zero =  call i64 asm sideeffect "addi.d $$zero, $$zero, 1", "={r0}"()
+  %ra =  call i64 asm sideeffect "addi.d $$ra, $$zero, 1", "={r1}"()
+  %tp =  call i64 asm sideeffect "addi.d $$tp, $$zero, 1", "={r2}"()
+  %a0 =  call i64 asm sideeffect "addi.d $$a0, $$zero, 1", "={r4}"()
+  %a1 =  call i64 asm sideeffect "addi.d $$a1, $$zero, 1", "={r5}"()
+  %a2 =  call i64 asm sideeffect "addi.d $$a2, $$zero, 1", "={r6}"()
+  %a3 =  call i64 asm sideeffect "addi.d $$a3, $$zero, 1", "={r7}"()
+  %a4 =  call i64 asm sideeffect "addi.d $$a4, $$zero, 1", "={r8}"()
+  %a5 =  call i64 asm sideeffect "addi.d $$a5, $$zero, 1", "={r9}"()
+  %a6 =  call i64 asm sideeffect "addi.d $$a6, $$zero, 1", "={r10}"()
+  %a7 =  call i64 asm sideeffect "addi.d $$a7, $$zero, 1", "={r11}"()
+  %t0 =  call i64 asm sideeffect "addi.d $$t0, $$zero, 1", "={r12}"()
+  %t1 =  call i64 asm sideeffect "addi.d $$t1, $$zero, 1", "={r13}"()
+  %t2 =  call i64 asm sideeffect "addi.d $$t2, $$zero, 1", "={r14}"()
+  %t3 =  call i64 asm sideeffect "addi.d $$t3, $$zero, 1", "={r15}"()
+  %t4 =  call i64 asm sideeffect "addi.d $$t4, $$zero, 1", "={r16}"()
+  %t5 =  call i64 asm sideeffect "addi.d $$t5, $$zero, 1", "={r17}"()
+  %t6 =  call i64 asm sideeffect "addi.d $$t6, $$zero, 1", "={r18}"()
+  %t7 =  call i64 asm sideeffect "addi.d $$t7, $$zero, 1", "={r19}"()
+  %t8 =  call i64 asm sideeffect "addi.d $$t8, $$zero, 1", "={r20}"()
+  ;; r21 Reserved (Non-allocatable)
+  %s9 =  call i64 asm sideeffect "addi.d $$s9, $$zero, 1", "={r22}"()
+  %s0 =  call i64 asm sideeffect "addi.d $$s0, $$zero, 1", "={r23}"()
+  %s1 =  call i64 asm sideeffect "addi.d $$s1, $$zero, 1", "={r24}"()
+  %s2 =  call i64 asm sideeffect "addi.d $$s2, $$zero, 1", "={r25}"()
+  %s3 =  call i64 asm sideeffect "addi.d $$s3, $$zero, 1", "={r26}"()
+  %s4 =  call i64 asm sideeffect "addi.d $$s4, $$zero, 1", "={r27}"()
+  %s5 =  call i64 asm sideeffect "addi.d $$s5, $$zero, 1", "={r28}"()
+  %s6 =  call i64 asm sideeffect "addi.d $$s6, $$zero, 1", "={r29}"()
+  %s7 =  call i64 asm sideeffect "addi.d $$s7, $$zero, 1", "={r30}"()
+  %s8 =  call i64 asm sideeffect "addi.d $$s8, $$zero, 1", "={r31}"()
+
+  %e = extractelement <16 x i8> %a, i64 0
+
+  store volatile i8 %e, ptr %s
+
+  call void asm sideeffect "# reg use $0", "{r0}"(i64 %zero)
+  call void asm sideeffect "# reg use $0", "{r1}"(i64 %ra)
+  call void asm sideeffect "# reg use $0", "{r2}"(i64 %tp)
+  call void asm sideeffect "# reg use $0", "{r4}"(i64 %a0)
+  call void asm sideeffect "# reg use $0", "{r5}"(i64 %a1)
+  call void asm sideeffect "# reg use $0", "{r6}"(i64 %a2)
+  call void asm sideeffect "# reg use $0", "{r7}"(i64 %a3)
+  call void asm sideeffect "# reg use $0", "{r8}"(i64 %a4)
+  call void asm sideeffect "# reg use $0", "{r9}"(i64 %a5)
+  call void asm sideeffect "# reg use $0", "{r10}"(i64 %a6)
+  call void asm sideeffect "# reg use $0", "{r11}"(i64 %a7)
+  call void asm sideeffect "# reg use $0", "{r12}"(i64 %t0)
+  call void asm sideeffect "# reg use $0", "{r13}"(i64 %t1)
+  call void asm sideeffect "# reg use $0", "{r14}"(i64 %t2)
+  call void asm sideeffect "# reg use $0", "{r15}"(i64 %t3)
+  call void asm sideeffect "# reg use $0", "{r16}"(i64 %t4)
+  call void asm sideeffect "# reg use $0", "{r17}"(i64 %t5)
+  call void asm sideeffect "# reg use $0", "{r18}"(i64 %t6)
+  call void asm sideeffect "# reg use $0", "{r19}"(i64 %t7)
+  call void asm sideeffect "# reg use $0", "{r20}"(i64 %t8)
+  ;; r21 Reserved (Non-allocatable)
+  call void asm sideeffect "# reg use $0", "{r22}"(i64 %s9)
+  call void asm sideeffect "# reg use $0", "{r23}"(i64 %s0)
+  call void asm sideeffect "# reg use $0", "{r24}"(i64 %s1)
+  call void asm sideeffect "# reg use $0", "{r25}"(i64 %s2)
+  call void asm sideeffect "# reg use $0", "{r26}"(i64 %s3)
+  call void asm sideeffect "# reg use $0", "{r27}"(i64 %s4)
+  call void asm sideeffect "# reg use $0", "{r28}"(i64 %s5)
+  call void asm sideeffect "# reg use $0", "{r29}"(i64 %s6)
+  call void asm sideeffect "# reg use $0", "{r30}"(i64 %s7)
+  call void asm sideeffect "# reg use $0", "{r31}"(i64 %s8)
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
index 0ee30120f77a..ad57bbf9ee5c 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/vmskcond.ll
@@ -588,3 +588,18 @@ define i2 @vmsk_trunc_i64(<2 x i64> %a) {
   %res = bitcast <2 x i1> %y to i2
   ret i2 %res
 }
+
+define i4 @vmsk_eq_allzeros_v4i8(<4 x i8> %a) {
+; CHECK-LABEL: vmsk_eq_allzeros_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vseqi.b $vr0, $vr0, 0
+; CHECK-NEXT:    vilvl.b $vr0, $vr0, $vr0
+; CHECK-NEXT:    vilvl.h $vr0, $vr0, $vr0
+; CHECK-NEXT:    vslli.w $vr0, $vr0, 24
+; CHECK-NEXT:    vmskltz.w $vr0, $vr0
+; CHECK-NEXT:    vpickve2gr.hu $a0, $vr0, 0
+; CHECK-NEXT:    ret
+  %1 = icmp eq <4 x i8> %a, zeroinitializer
+  %2 = bitcast <4 x i1> %1 to i4
+  ret i4 %2
+}
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
index 9f15604fcca6..69995a0721f8 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment-with-variable-sized-objects.ll
@@ -36,15 +36,15 @@ define void @caller(i32 %n) {
 ;
 ; LA64-LABEL: caller:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $s8, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -128
+; LA64-NEXT:    .cfi_def_cfa_offset 128
+; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $s8, $sp, 104 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
 ; LA64-NEXT:    .cfi_offset 31, -24
-; LA64-NEXT:    addi.d $fp, $sp, 64
+; LA64-NEXT:    addi.d $fp, $sp, 128
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 5, 0
 ; LA64-NEXT:    move $s8, $sp
@@ -54,14 +54,14 @@ define void @caller(i32 %n) {
 ; LA64-NEXT:    slli.d $a0, $a0, 4
 ; LA64-NEXT:    sub.d $a0, $sp, $a0
 ; LA64-NEXT:    move $sp, $a0
-; LA64-NEXT:    addi.d $a1, $s8, 0
+; LA64-NEXT:    addi.d $a1, $s8, 64
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -64
-; LA64-NEXT:    ld.d $s8, $sp, 40 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    addi.d $sp, $fp, -128
+; LA64-NEXT:    ld.d $s8, $sp, 104 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 128
 ; LA64-NEXT:    ret
   %1 = alloca i8, i32 %n
   %2 = alloca i32, align 64
diff --git a/llvm/test/CodeGen/LoongArch/stack-realignment.ll b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
index 0645339358b6..0188884543ad 100644
--- a/llvm/test/CodeGen/LoongArch/stack-realignment.ll
+++ b/llvm/test/CodeGen/LoongArch/stack-realignment.ll
@@ -28,22 +28,22 @@ define void @caller32() {
 ;
 ; LA64-LABEL: caller32:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -32
-; LA64-NEXT:    .cfi_def_cfa_offset 32
-; LA64-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -64
+; LA64-NEXT:    .cfi_def_cfa_offset 64
+; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 32
+; LA64-NEXT:    addi.d $fp, $sp, 64
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 4, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 32
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -32
-; LA64-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 32
+; LA64-NEXT:    addi.d $sp, $fp, -64
+; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 64
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 32
   call void @callee(ptr %1)
@@ -102,22 +102,22 @@ define void @caller64() {
 ;
 ; LA64-LABEL: caller64:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
-; LA64-NEXT:    st.d $ra, $sp, 56 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -128
+; LA64-NEXT:    .cfi_def_cfa_offset 128
+; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 64
+; LA64-NEXT:    addi.d $fp, $sp, 128
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 5, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 64
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -64
-; LA64-NEXT:    ld.d $fp, $sp, 48 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 56 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    addi.d $sp, $fp, -128
+; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 128
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 64
   call void @callee(ptr %1)
@@ -176,22 +176,22 @@ define void @caller128() {
 ;
 ; LA64-LABEL: caller128:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -128
-; LA64-NEXT:    .cfi_def_cfa_offset 128
-; LA64-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -256
+; LA64-NEXT:    .cfi_def_cfa_offset 256
+; LA64-NEXT:    st.d $ra, $sp, 248 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 240 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 128
+; LA64-NEXT:    addi.d $fp, $sp, 256
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 6, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 128
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -128
-; LA64-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 128
+; LA64-NEXT:    addi.d $sp, $fp, -256
+; LA64-NEXT:    ld.d $fp, $sp, 240 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 248 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 256
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 128
   call void @callee(ptr %1)
@@ -250,22 +250,22 @@ define void @caller256() {
 ;
 ; LA64-LABEL: caller256:
 ; LA64:       # %bb.0:
-; LA64-NEXT:    addi.d $sp, $sp, -256
-; LA64-NEXT:    .cfi_def_cfa_offset 256
-; LA64-NEXT:    st.d $ra, $sp, 248 # 8-byte Folded Spill
-; LA64-NEXT:    st.d $fp, $sp, 240 # 8-byte Folded Spill
+; LA64-NEXT:    addi.d $sp, $sp, -512
+; LA64-NEXT:    .cfi_def_cfa_offset 512
+; LA64-NEXT:    st.d $ra, $sp, 504 # 8-byte Folded Spill
+; LA64-NEXT:    st.d $fp, $sp, 496 # 8-byte Folded Spill
 ; LA64-NEXT:    .cfi_offset 1, -8
 ; LA64-NEXT:    .cfi_offset 22, -16
-; LA64-NEXT:    addi.d $fp, $sp, 256
+; LA64-NEXT:    addi.d $fp, $sp, 512
 ; LA64-NEXT:    .cfi_def_cfa 22, 0
 ; LA64-NEXT:    bstrins.d $sp, $zero, 7, 0
-; LA64-NEXT:    addi.d $a0, $sp, 0
+; LA64-NEXT:    addi.d $a0, $sp, 256
 ; LA64-NEXT:    pcaddu18i $ra, %call36(callee)
 ; LA64-NEXT:    jirl $ra, $ra, 0
-; LA64-NEXT:    addi.d $sp, $fp, -256
-; LA64-NEXT:    ld.d $fp, $sp, 240 # 8-byte Folded Reload
-; LA64-NEXT:    ld.d $ra, $sp, 248 # 8-byte Folded Reload
-; LA64-NEXT:    addi.d $sp, $sp, 256
+; LA64-NEXT:    addi.d $sp, $fp, -512
+; LA64-NEXT:    ld.d $fp, $sp, 496 # 8-byte Folded Reload
+; LA64-NEXT:    ld.d $ra, $sp, 504 # 8-byte Folded Reload
+; LA64-NEXT:    addi.d $sp, $sp, 512
 ; LA64-NEXT:    ret
   %1 = alloca i8, align 256
   call void @callee(ptr %1)
diff --git a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
index eb656ad94e28..6e9d26ab362d 100644
--- a/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
+++ b/llvm/test/CodeGen/LoongArch/target-abi-from-triple-edge-cases.ll
@@ -24,9 +24,9 @@
 ; NO-WARNING-NOT:  warning: triple-implied ABI conflicts with provided target-abi 'lp64d', using target-abi
 
 ;; Check that ILP32-on-LA64 and LP64-on-LA32 combinations are handled properly.
-; RUN: llc --mtriple=loongarch64 --target-abi=ilp32d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch64-linux-gnu --target-abi=ilp32d --mattr=+d < %s 2>&1 \
 ; RUN:   | FileCheck %s --check-prefixes=LP64D,32ON64
-; RUN: llc --mtriple=loongarch32 --target-abi=lp64d --mattr=+d < %s 2>&1 \
+; RUN: llc --mtriple=loongarch32-linux-gnu --target-abi=lp64d --mattr=+d < %s 2>&1 \
 ; RUN:   | FileCheck %s --check-prefixes=ILP32D,64ON32
 
 ; 32ON64: warning: 32-bit ABIs are not supported for 64-bit targets, ignoring and using triple-implied ABI
@@ -49,12 +49,6 @@
 
 ; LP64D-LP64F-NOF: warning: both target-abi and the triple-implied ABI are invalid, ignoring and using feature-implied ABI
 
-;; Check that triple-implied ABI are invalid, use feature-implied ABI
-; RUN: llc --mtriple=loongarch64 --mattr=-f < %s 2>&1 \
-; RUN:   | FileCheck %s --check-prefixes=LP64S,LP64D-NONE-NOF
-
-; LP64D-NONE-NOF: warning: the triple-implied ABI is invalid, ignoring and using feature-implied ABI
-
 define float @f(float %a) {
 ; ILP32D-LABEL: f:
 ; ILP32D:       # %bb.0:
diff --git a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
index 925fdf3d6064..0d441e66a0c8 100644
--- a/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
+++ b/llvm/test/CodeGen/LoongArch/unaligned-memcpy-inline.ll
@@ -121,19 +121,19 @@ define void @t3() {
 ;
 ; LA64-LABEL: t3:
 ; LA64:       # %bb.0: # %entry
-; LA64-NEXT:    addi.d $sp, $sp, -64
-; LA64-NEXT:    .cfi_def_cfa_offset 64
+; LA64-NEXT:    addi.d $sp, $sp, -80
+; LA64-NEXT:    .cfi_def_cfa_offset 80
 ; LA64-NEXT:    pcalau12i $a0, %pc_hi20(.L.str)
 ; LA64-NEXT:    addi.d $a0, $a0, %pc_lo12(.L.str)
 ; LA64-NEXT:    ld.h $a1, $a0, 20
 ; LA64-NEXT:    ld.w $a2, $a0, 16
 ; LA64-NEXT:    ld.d $a3, $a0, 8
 ; LA64-NEXT:    ld.d $a0, $a0, 0
-; LA64-NEXT:    st.h $a1, $sp, 20
-; LA64-NEXT:    st.w $a2, $sp, 16
-; LA64-NEXT:    st.d $a3, $sp, 8
-; LA64-NEXT:    st.d $a0, $sp, 0
-; LA64-NEXT:    addi.d $sp, $sp, 64
+; LA64-NEXT:    st.h $a1, $sp, 36
+; LA64-NEXT:    st.w $a2, $sp, 32
+; LA64-NEXT:    st.d $a3, $sp, 24
+; LA64-NEXT:    st.d $a0, $sp, 16
+; LA64-NEXT:    addi.d $sp, $sp, 80
 ; LA64-NEXT:    ret
 entry:
   %msgbuf = alloca [64 x i8], align 1
diff --git a/llvm/test/CodeGen/LoongArch/vararg.ll b/llvm/test/CodeGen/LoongArch/vararg.ll
index 939cd2015c5b..bc4b8a77c7e1 100644
--- a/llvm/test/CodeGen/LoongArch/vararg.ll
+++ b/llvm/test/CodeGen/LoongArch/vararg.ll
@@ -47,7 +47,7 @@ define i64 @va1(ptr %fmt, ...) {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a1, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a1, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $a1, $fp, -32
 ; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
@@ -94,7 +94,7 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a1, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a1, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $a1, $fp, -32
 ; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
 ; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
@@ -112,11 +112,11 @@ define i64 @va1_va_arg(ptr %fmt, ...) nounwind {
 define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-LABEL: va1_va_arg_alloca:
 ; LA64-FPELIM:       # %bb.0:
-; LA64-FPELIM-NEXT:    addi.d $sp, $sp, -96
-; LA64-FPELIM-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
-; LA64-FPELIM-NEXT:    addi.d $fp, $sp, 32
+; LA64-FPELIM-NEXT:    addi.d $sp, $sp, -112
+; LA64-FPELIM-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-FPELIM-NEXT:    addi.d $fp, $sp, 48
 ; LA64-FPELIM-NEXT:    move $s0, $a1
 ; LA64-FPELIM-NEXT:    st.d $a7, $fp, 56
 ; LA64-FPELIM-NEXT:    st.d $a6, $fp, 48
@@ -126,7 +126,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-NEXT:    st.d $a2, $fp, 16
 ; LA64-FPELIM-NEXT:    st.d $a1, $fp, 8
 ; LA64-FPELIM-NEXT:    addi.d $a0, $fp, 16
-; LA64-FPELIM-NEXT:    st.d $a0, $fp, -32
+; LA64-FPELIM-NEXT:    st.d $a0, $fp, -40
 ; LA64-FPELIM-NEXT:    addi.d $a0, $a1, 15
 ; LA64-FPELIM-NEXT:    bstrins.d $a0, $zero, 3, 0
 ; LA64-FPELIM-NEXT:    sub.d $a0, $sp, $a0
@@ -134,20 +134,20 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-FPELIM-NEXT:    pcaddu18i $ra, %call36(notdead)
 ; LA64-FPELIM-NEXT:    jirl $ra, $ra, 0
 ; LA64-FPELIM-NEXT:    move $a0, $s0
-; LA64-FPELIM-NEXT:    addi.d $sp, $fp, -32
-; LA64-FPELIM-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-FPELIM-NEXT:    addi.d $sp, $sp, 96
+; LA64-FPELIM-NEXT:    addi.d $sp, $fp, -48
+; LA64-FPELIM-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-FPELIM-NEXT:    addi.d $sp, $sp, 112
 ; LA64-FPELIM-NEXT:    ret
 ;
 ; LA64-WITHFP-LABEL: va1_va_arg_alloca:
 ; LA64-WITHFP:       # %bb.0:
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -96
-; LA64-WITHFP-NEXT:    st.d $ra, $sp, 24 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $fp, $sp, 16 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $s0, $sp, 8 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 32
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -112
+; LA64-WITHFP-NEXT:    st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 48
 ; LA64-WITHFP-NEXT:    move $s0, $a1
 ; LA64-WITHFP-NEXT:    st.d $a7, $fp, 56
 ; LA64-WITHFP-NEXT:    st.d $a6, $fp, 48
@@ -157,7 +157,7 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    st.d $a2, $fp, 16
 ; LA64-WITHFP-NEXT:    st.d $a1, $fp, 8
 ; LA64-WITHFP-NEXT:    addi.d $a0, $fp, 16
-; LA64-WITHFP-NEXT:    st.d $a0, $fp, -32
+; LA64-WITHFP-NEXT:    st.d $a0, $fp, -40
 ; LA64-WITHFP-NEXT:    addi.d $a0, $a1, 15
 ; LA64-WITHFP-NEXT:    bstrins.d $a0, $zero, 3, 0
 ; LA64-WITHFP-NEXT:    sub.d $a0, $sp, $a0
@@ -165,11 +165,11 @@ define i64 @va1_va_arg_alloca(ptr %fmt, ...) nounwind {
 ; LA64-WITHFP-NEXT:    pcaddu18i $ra, %call36(notdead)
 ; LA64-WITHFP-NEXT:    jirl $ra, $ra, 0
 ; LA64-WITHFP-NEXT:    move $a0, $s0
-; LA64-WITHFP-NEXT:    addi.d $sp, $fp, -32
-; LA64-WITHFP-NEXT:    ld.d $s0, $sp, 8 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 16 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 24 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 96
+; LA64-WITHFP-NEXT:    addi.d $sp, $fp, -48
+; LA64-WITHFP-NEXT:    ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 112
 ; LA64-WITHFP-NEXT:    ret
   %va = alloca ptr, align 8
   call void @llvm.va_start(ptr %va)
@@ -314,10 +314,10 @@ define void @va_aligned_stack_caller() nounwind {
 ;
 ; LA64-WITHFP-LABEL: va_aligned_stack_caller:
 ; LA64-WITHFP:       # %bb.0:
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -112
-; LA64-WITHFP-NEXT:    st.d $ra, $sp, 104 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    st.d $fp, $sp, 96 # 8-byte Folded Spill
-; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 112
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, -128
+; LA64-WITHFP-NEXT:    st.d $ra, $sp, 120 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    st.d $fp, $sp, 112 # 8-byte Folded Spill
+; LA64-WITHFP-NEXT:    addi.d $fp, $sp, 128
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 17
 ; LA64-WITHFP-NEXT:    st.d $a0, $sp, 48
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 16
@@ -336,23 +336,23 @@ define void @va_aligned_stack_caller() nounwind {
 ; LA64-WITHFP-NEXT:    lu32i.d $a0, 335544
 ; LA64-WITHFP-NEXT:    lu52i.d $a0, $a0, -328
 ; LA64-WITHFP-NEXT:    st.d $a0, $sp, 16
-; LA64-WITHFP-NEXT:    st.d $zero, $fp, -24
+; LA64-WITHFP-NEXT:    st.d $zero, $fp, -40
 ; LA64-WITHFP-NEXT:    vrepli.b $vr0, 0
-; LA64-WITHFP-NEXT:    vst $vr0, $fp, -40
+; LA64-WITHFP-NEXT:    vst $vr0, $fp, -56
 ; LA64-WITHFP-NEXT:    ori $a5, $zero, 1000
 ; LA64-WITHFP-NEXT:    ori $a0, $zero, 1
 ; LA64-WITHFP-NEXT:    ori $a1, $zero, 11
-; LA64-WITHFP-NEXT:    addi.d $a2, $fp, -48
+; LA64-WITHFP-NEXT:    addi.d $a2, $fp, -64
 ; LA64-WITHFP-NEXT:    ori $a3, $zero, 12
 ; LA64-WITHFP-NEXT:    ori $a4, $zero, 13
 ; LA64-WITHFP-NEXT:    ori $a7, $zero, 1
-; LA64-WITHFP-NEXT:    st.d $a5, $fp, -48
+; LA64-WITHFP-NEXT:    st.d $a5, $fp, -64
 ; LA64-WITHFP-NEXT:    move $a6, $zero
 ; LA64-WITHFP-NEXT:    pcaddu18i $ra, %call36(va_aligned_stack_callee)
 ; LA64-WITHFP-NEXT:    jirl $ra, $ra, 0
-; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 96 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 104 # 8-byte Folded Reload
-; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 112
+; LA64-WITHFP-NEXT:    ld.d $fp, $sp, 112 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    ld.d $ra, $sp, 120 # 8-byte Folded Reload
+; LA64-WITHFP-NEXT:    addi.d $sp, $sp, 128
 ; LA64-WITHFP-NEXT:    ret
   %1 = call i32 (i32, ...) @va_aligned_stack_callee(i32 1, i32 11,
     i256 1000, i32 12, i32 13, i128 18446744073709551616, i32 14,
diff --git a/llvm/test/CodeGen/Mips/abiflags-soft-float.ll b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
new file mode 100644
index 000000000000..01821f2d9b6c
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/abiflags-soft-float.ll
@@ -0,0 +1,12 @@
+; RUN: llc -filetype=obj -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o tmp.o
+; RUN: llvm-readobj -A tmp.o | FileCheck %s -check-prefix=OBJ
+; RUN: llc -filetype=asm -mtriple mipsel-unknown-linux -mcpu=mips32 %s -o - | \
+; RUN: FileCheck %s -check-prefix=ASM
+
+; OBJ: FP ABI: Soft float
+; ASM: .module	softfloat 
+
+define dso_local void @asm_is_null() "use-soft-float"="true" {
+  call void asm sideeffect "", ""()
+  ret void
+}
diff --git a/llvm/test/CodeGen/Mips/nan_lowering.ll b/llvm/test/CodeGen/Mips/nan_lowering.ll
new file mode 100644
index 000000000000..2a11278e14b6
--- /dev/null
+++ b/llvm/test/CodeGen/Mips/nan_lowering.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple=mips-linux-gnu -mattr=-nan2008 < %s | FileCheck %s
+; RUN: llc -mtriple=mips-linux-gnu -mattr=+nan2008 < %s | FileCheck %s
+
+; Make sure that lowering does not corrupt the value of NaN values,
+; regardless of what the NaN mode is.
+
+define float @test1() {
+; CHECK: .4byte 0x7fc00000
+  ret float bitcast (i32 u0x7fc00000 to float)
+}
+
+define float @test2() {
+; CHECK: .4byte 0x7fc00001
+  ret float bitcast (i32 u0x7fc00001 to float)
+}
+
+define float @test3() {
+; CHECK: .4byte 0x7f800000
+  ret float bitcast (i32 u0x7f800000 to float)
+}
+
+define float @test4() {
+; CHECK: .4byte 0x7f800001
+  ret float bitcast (i32 u0x7f800001 to float)
+}
diff --git a/llvm/test/CodeGen/Mips/qnan.ll b/llvm/test/CodeGen/Mips/qnan.ll
deleted file mode 100644
index e5b4aa1b42ee..000000000000
--- a/llvm/test/CodeGen/Mips/qnan.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu < %s -o - | FileCheck %s -check-prefixes=MIPS_Legacy
-; RUN: llc -O3 -mcpu=mips32r2 -mtriple=mips-linux-gnu -mattr=+nan2008 < %s -o - | FileCheck %s -check-prefixes=MIPS_NaN2008
-
-define dso_local float @nan(float noundef %a, float noundef %b) local_unnamed_addr #0 {
-; MIPS_Legacy: $CPI0_0:
-; MIPS_Legacy-NEXT: .4byte  0x7fa00000 # float NaN
-
-; MIPS_NaN2008: $CPI0_0:
-; MIPS_NaN2008-NEXT: .4byte  0x7fc00000 # float NaN
-
-entry:
-  %0 = tail call float @llvm.minimum.f32(float %a, float %b)
-  ret float %0
-}
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index d0e2c1817f69..3baefde072be 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -45,11 +45,12 @@ define <2 x half> @test_ret_const() #0 {
 define half @test_extract_0(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_extract_0(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %e = extractelement <2 x half> %a, i32 0
@@ -59,12 +60,13 @@ define half @test_extract_0(<2 x half> %a) #0 {
 define half @test_extract_1(<2 x half> %a) #0 {
 ; CHECK-LABEL: test_extract_1(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<3>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0];
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs2;
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs1;
 ; CHECK-NEXT:    ret;
   %e = extractelement <2 x half> %a, i32 1
   ret half %e
@@ -80,8 +82,9 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; CHECK-NEXT:    setp.eq.b64 %p1, %rd1, 0;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
 ; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
 ; CHECK-NEXT:    ret;
@@ -107,14 +110,16 @@ define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fadd_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fadd_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    add.rn.f32 %r5, %r4, %r3;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NOF16-NEXT:    add.rn.f32 %r8, %r7, %r6;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -143,7 +148,8 @@ define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_0_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_0_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NOF16-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -175,7 +181,8 @@ define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fadd_imm_1_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fadd_imm_1_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NOF16-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -207,14 +214,16 @@ define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fsub_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fsub_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fsub_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fsub_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    sub.rn.f32 %r5, %r4, %r3;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NOF16-NEXT:    sub.rn.f32 %r8, %r7, %r6;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -242,7 +251,8 @@ define <2 x half> @test_fneg(<2 x half> %a) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<8>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fneg_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fneg_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NOF16-NEXT:    mov.b32 %r3, 0f00000000;
 ; CHECK-NOF16-NEXT:    sub.rn.f32 %r4, %r3, %r2;
@@ -275,14 +285,16 @@ define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<10>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fmul_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fmul_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmul_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmul_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    mul.rn.f32 %r5, %r4, %r3;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NOF16-NEXT:    mul.rn.f32 %r8, %r7, %r6;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NOF16-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -299,14 +311,16 @@ define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1];
-; CHECK-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_fdiv_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fdiv_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r3;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NEXT:    div.rn.f32 %r8, %r7, %r6;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -331,10 +345,12 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<18>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_frem_param_0];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_frem_param_1];
-; CHECK-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_frem_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_frem_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r3;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r6, %r5;
 ; CHECK-NEXT:    neg.f32 %r7, %r6;
@@ -342,8 +358,8 @@ define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NEXT:    testp.infinite.f32 %p1, %r3;
 ; CHECK-NEXT:    selp.f32 %r9, %r4, %r8, %p1;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r9;
-; CHECK-NEXT:    cvt.f32.f16 %r10, %rs3;
-; CHECK-NEXT:    cvt.f32.f16 %r11, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r10, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r11, %rs3;
 ; CHECK-NEXT:    div.rn.f32 %r12, %r11, %r10;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r13, %r12;
 ; CHECK-NEXT:    neg.f32 %r14, %r13;
@@ -535,11 +551,13 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; CHECK-F16-NEXT:  // %bb.0:
 ; CHECK-F16-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
 ; CHECK-F16-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
-; CHECK-F16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
+; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
 ; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r3, %r4;
-; CHECK-F16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_1];
-; CHECK-F16-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p2;
-; CHECK-F16-NEXT:    selp.b16 %rs6, %rs1, %rs3, %p1;
+; CHECK-F16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-F16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-F16-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; CHECK-F16-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
 ; CHECK-F16-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; CHECK-F16-NEXT:    ret;
 ;
@@ -550,18 +568,22 @@ define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<9>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_3];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_2];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs5;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r6, %r5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs6;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs4;
 ; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r8, %r7;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1];
-; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs2, %rs8, %p2;
-; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs1, %rs7, %p1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NOF16-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
+; CHECK-NOF16-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
 ; CHECK-NOF16-NEXT:    st.param.v2.b16 [func_retval0], {%rs10, %rs9};
 ; CHECK-NOF16-NEXT:    ret;
   %cc = fcmp une <2 x half> %c, %d
@@ -579,11 +601,13 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-F16-NEXT:  // %bb.0:
 ; CHECK-F16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0];
+; CHECK-F16-NEXT:    ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
+; CHECK-F16-NEXT:    ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
 ; CHECK-F16-NEXT:    setp.neu.f16x2 %p1|%p2, %r1, %r2;
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f32_f16_param_1];
-; CHECK-F16-NEXT:    selp.f32 %r7, %r4, %r6, %p2;
-; CHECK-F16-NEXT:    selp.f32 %r8, %r3, %r5, %p1;
+; CHECK-F16-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-F16-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-F16-NEXT:    selp.f32 %r7, %r6, %r4, %p2;
+; CHECK-F16-NEXT:    selp.f32 %r8, %r5, %r3, %p1;
 ; CHECK-F16-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; CHECK-F16-NEXT:    ret;
 ;
@@ -595,18 +619,22 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
 ; CHECK-NOF16-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f16_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f32_f16_param_3];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f32_f16_param_2];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r6, %r5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs2;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs4;
-; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r8, %r7;
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r9, %r10}, [test_select_cc_f32_f16_param_1];
-; CHECK-NOF16-NEXT:    selp.f32 %r11, %r4, %r10, %p2;
-; CHECK-NOF16-NEXT:    selp.f32 %r12, %r3, %r9, %p1;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b64 %rd2, [test_select_cc_f32_f16_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b64 %rd1, [test_select_cc_f32_f16_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs3;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r4, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs4;
+; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r6, %r5;
+; CHECK-NOF16-NEXT:    mov.b64 {%r7, %r8}, %rd2;
+; CHECK-NOF16-NEXT:    mov.b64 {%r9, %r10}, %rd1;
+; CHECK-NOF16-NEXT:    selp.f32 %r11, %r10, %r8, %p2;
+; CHECK-NOF16-NEXT:    selp.f32 %r12, %r9, %r7, %p1;
 ; CHECK-NOF16-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r11};
 ; CHECK-NOF16-NEXT:    ret;
                                            <2 x half> %c, <2 x half> %d) #0 {
@@ -624,14 +652,18 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_f16_f32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f16_f32_param_2];
-; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_f16_f32_param_3];
-; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r5;
-; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r6;
-; CHECK-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_f16_f32_param_1];
-; CHECK-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p2;
-; CHECK-NEXT:    selp.b16 %rs6, %rs1, %rs3, %p1;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_select_cc_f16_f32_param_3];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_cc_f16_f32_param_2];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NEXT:    setp.neu.f32 %p1, %r5, %r3;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r6, %r4;
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; CHECK-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; CHECK-NEXT:    ret;
                                           <2 x float> %c, <2 x float> %d) #0 {
@@ -664,13 +696,15 @@ define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_une_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_une_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_une_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_une_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.neu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.neu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -705,13 +739,15 @@ define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ueq_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ueq_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ueq_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ueq_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.equ.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.equ.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -746,13 +782,15 @@ define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ugt_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ugt_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ugt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ugt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.gtu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.gtu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -787,13 +825,15 @@ define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uge_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uge_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uge_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uge_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.geu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.geu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -828,13 +868,15 @@ define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ult_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ult_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ult_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ult_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.ltu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.ltu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -869,13 +911,15 @@ define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ule_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ule_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ule_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ule_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.leu.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.leu.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -911,13 +955,15 @@ define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_uno_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_uno_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_uno_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_uno_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.nan.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -952,13 +998,15 @@ define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_one_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_one_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_one_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_one_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.ne.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.ne.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -993,13 +1041,15 @@ define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oeq_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oeq_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oeq_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oeq_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.eq.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1034,13 +1084,15 @@ define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ogt_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ogt_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ogt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ogt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.gt.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.gt.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1075,13 +1127,15 @@ define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_oge_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_oge_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_oge_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_oge_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.ge.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.ge.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1116,13 +1170,15 @@ define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_olt_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_olt_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_olt_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_olt_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.lt.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.lt.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1157,13 +1213,15 @@ define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ole_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ole_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ole_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ole_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.le.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.le.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1198,13 +1256,15 @@ define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fcmp_ord_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fcmp_ord_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fcmp_ord_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fcmp_ord_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NOF16-NEXT:    setp.num.f32 %p1, %r4, %r3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs3;
 ; CHECK-NOF16-NEXT:    setp.num.f32 %p2, %r6, %r5;
 ; CHECK-NOF16-NEXT:    selp.b16 %rs5, -1, 0, %p2;
 ; CHECK-NOF16-NEXT:    st.param.b8 [func_retval0], %rs5;
@@ -1222,7 +1282,8 @@ define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptosi_i32_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rzi.s32.f16 %r2, %rs2;
 ; CHECK-NEXT:    cvt.rzi.s32.f16 %r3, %rs1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
@@ -1239,7 +1300,8 @@ define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_i64_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptosi_i64_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rzi.s64.f16 %rd1, %rs2;
 ; CHECK-NEXT:    cvt.rzi.s64.f16 %rd2, %rs1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
@@ -1255,7 +1317,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rzi.u32.f16 %r2, %rs2;
 ; CHECK-NEXT:    cvt.rzi.u32.f16 %r3, %rs1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
@@ -1272,7 +1335,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rzi.u64.f16 %rd1, %rs2;
 ; CHECK-NEXT:    cvt.rzi.u64.f16 %rd2, %rs1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
@@ -1369,16 +1433,17 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.u32 %rs2, %r2;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_uitofp_2xi32_fadd_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs2;
-; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r4, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r5, %r4;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r6;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r7, %r8;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r8, %r7;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r9;
 ; CHECK-NOF16-NEXT:    mov.b32 %r10, {%rs6, %rs5};
 ; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r10;
@@ -1411,16 +1476,17 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs1, %r1;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.s32 %rs2, %r2;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_sitofp_2xi32_fadd_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs2;
-; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r4, %r5;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r6, %r5, %r4;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs5, %r6;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
-; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r7, %r8;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
+; CHECK-NOF16-NEXT:    add.rn.f32 %r9, %r8, %r7;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs6, %r9;
 ; CHECK-NOF16-NEXT:    mov.b32 %r10, {%rs6, %rs5};
 ; CHECK-NOF16-NEXT:    st.param.b32 [func_retval0], %r10;
@@ -1468,7 +1534,8 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xfloat_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xfloat_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NEXT:    cvt.f32.f16 %r3, %rs1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
@@ -1485,7 +1552,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f64.f16 %rd1, %rs2;
 ; CHECK-NEXT:    cvt.f64.f16 %rd2, %rs1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
@@ -1578,7 +1646,8 @@ define <2 x half> @test_sqrt(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_sqrt_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sqrt_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NEXT:    sqrt.rn.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -1606,7 +1675,8 @@ define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_sin_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sin_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NEXT:    sin.approx.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -1627,7 +1697,8 @@ define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
 ; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_cos_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_cos_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NEXT:    cos.approx.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -1703,17 +1774,20 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fma_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fma_param_2];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs5, %rs6}, [test_fma_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fma_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fma_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fma_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
 ; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
 ; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
 ; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
@@ -1740,7 +1814,8 @@ define <2 x half> @test_fabs(<2 x half> %a) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<7>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fabs_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fabs_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NOF16-NEXT:    abs.f32 %r3, %r2;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs3, %r3;
@@ -1761,14 +1836,16 @@ define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_minnum_param_0];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_minnum_param_1];
-; CHECK-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_minnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_minnum_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NEXT:    min.f32 %r5, %r4, %r3;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NEXT:    min.f32 %r8, %r7, %r6;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -1785,14 +1862,16 @@ define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<10>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_maxnum_param_0];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_maxnum_param_1];
-; CHECK-NEXT:    cvt.f32.f16 %r3, %rs4;
-; CHECK-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NEXT:    ld.param.b32 %r2, [test_maxnum_param_1];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_maxnum_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.f32.f16 %r3, %rs2;
+; CHECK-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %r4, %rs4;
 ; CHECK-NEXT:    max.f32 %r5, %r4, %r3;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %r5;
-; CHECK-NEXT:    cvt.f32.f16 %r6, %rs3;
-; CHECK-NEXT:    cvt.f32.f16 %r7, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r6, %rs1;
+; CHECK-NEXT:    cvt.f32.f16 %r7, %rs3;
 ; CHECK-NEXT:    max.f32 %r8, %r7, %r6;
 ; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %r8;
 ; CHECK-NEXT:    mov.b32 %r9, {%rs6, %rs5};
@@ -1822,13 +1901,15 @@ define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<3>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_param_1];
-; CHECK-NOF16-NEXT:    and.b16 %rs5, %rs4, -32768;
-; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs2, 32767;
-; CHECK-NOF16-NEXT:    or.b16 %rs7, %rs6, %rs5;
-; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs3, -32768;
-; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs1, 32767;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_copysign_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, -32768;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs4, %rs5}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs5, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs7, %rs6, %rs3;
+; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs1, -32768;
+; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs4, 32767;
 ; CHECK-NOF16-NEXT:    or.b16 %rs10, %rs9, %rs8;
 ; CHECK-NOF16-NEXT:    st.param.v2.b16 [func_retval0], {%rs10, %rs7};
 ; CHECK-NOF16-NEXT:    ret;
@@ -1844,8 +1925,9 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 ; CHECK-F16-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-F16-EMPTY:
 ; CHECK-F16-NEXT:  // %bb.0:
+; CHECK-F16-NEXT:    ld.param.b64 %rd1, [test_copysign_f32_param_1];
 ; CHECK-F16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
-; CHECK-F16-NEXT:    ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
+; CHECK-F16-NEXT:    mov.b64 {%r2, %r3}, %rd1;
 ; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs1, %r3;
 ; CHECK-F16-NEXT:    cvt.rn.f16.f32 %rs2, %r2;
 ; CHECK-F16-NEXT:    mov.b32 %r4, {%rs2, %rs1};
@@ -1862,8 +1944,10 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f32_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b32 {%r2, %r3}, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b64 %rd1, [test_copysign_f32_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f32_param_0];
+; CHECK-NOF16-NEXT:    mov.b64 {%r2, %r3}, %rd1;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
 ; CHECK-NOF16-NEXT:    and.b32 %r4, %r3, -2147483648;
 ; CHECK-NOF16-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs4}, %r4; }
@@ -1906,7 +1990,8 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
 ; CHECK-NOF16-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_copysign_f64_param_1];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_f64_param_0];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_f64_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs2, 32767;
 ; CHECK-NOF16-NEXT:    and.b64 %rd3, %rd2, -9223372036854775808;
 ; CHECK-NOF16-NEXT:    shr.u64 %rd4, %rd3, 48;
@@ -1948,13 +2033,15 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<5>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_copysign_extended_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_copysign_extended_param_1];
-; CHECK-NOF16-NEXT:    and.b16 %rs5, %rs3, -32768;
-; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs1, 32767;
-; CHECK-NOF16-NEXT:    or.b16 %rs7, %rs6, %rs5;
-; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs4, -32768;
-; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs2, 32767;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_copysign_extended_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_copysign_extended_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NOF16-NEXT:    and.b16 %rs3, %rs1, -32768;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs4, %rs5}, %r1;
+; CHECK-NOF16-NEXT:    and.b16 %rs6, %rs4, 32767;
+; CHECK-NOF16-NEXT:    or.b16 %rs7, %rs6, %rs3;
+; CHECK-NOF16-NEXT:    and.b16 %rs8, %rs2, -32768;
+; CHECK-NOF16-NEXT:    and.b16 %rs9, %rs5, 32767;
 ; CHECK-NOF16-NEXT:    or.b16 %rs10, %rs9, %rs8;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r3, %rs10;
 ; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs7;
@@ -1972,7 +2059,8 @@ define <2 x half> @test_floor(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_floor_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_floor_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rmi.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rmi.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -1988,7 +2076,8 @@ define <2 x half> @test_ceil(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_ceil_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_ceil_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rpi.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rpi.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2004,7 +2093,8 @@ define <2 x half> @test_trunc(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_trunc_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_trunc_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rzi.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rzi.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2020,7 +2110,8 @@ define <2 x half> @test_rint(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_rint_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_rint_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2036,7 +2127,8 @@ define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_nearbyint_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_nearbyint_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2052,7 +2144,8 @@ define <2 x half> @test_roundeven(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_roundeven_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_roundeven_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rni.f16.f16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2070,7 +2163,8 @@ define <2 x half> @test_round(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<21>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_round_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_round_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.f32.f16 %r2, %rs2;
 ; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
 ; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
@@ -2121,17 +2215,20 @@ define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
 ; CHECK-NOF16-NEXT:    .reg .b32 %r<13>;
 ; CHECK-NOF16-EMPTY:
 ; CHECK-NOF16-NEXT:  // %bb.0:
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fmuladd_param_0];
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_fmuladd_param_2];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs4;
-; CHECK-NOF16-NEXT:    ld.param.v2.b16 {%rs5, %rs6}, [test_fmuladd_param_1];
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs6;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs2;
+; CHECK-NOF16-NEXT:    ld.param.b32 %r3, [test_fmuladd_param_2];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r2, [test_fmuladd_param_1];
+; CHECK-NOF16-NEXT:    ld.param.b32 %r1, [test_fmuladd_param_0];
+; CHECK-NOF16-NEXT:    mov.b32 {%rs1, %rs2}, %r3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r4, %rs2;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs3, %rs4}, %r2;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r5, %rs4;
+; CHECK-NOF16-NEXT:    mov.b32 {%rs5, %rs6}, %r1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r6, %rs6;
 ; CHECK-NOF16-NEXT:    fma.rn.f32 %r7, %r6, %r5, %r4;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs7, %r7;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs3;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs5;
-; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r8, %rs1;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r9, %rs3;
+; CHECK-NOF16-NEXT:    cvt.f32.f16 %r10, %rs5;
 ; CHECK-NOF16-NEXT:    fma.rn.f32 %r11, %r10, %r9, %r8;
 ; CHECK-NOF16-NEXT:    cvt.rn.f16.f32 %rs8, %r11;
 ; CHECK-NOF16-NEXT:    mov.b32 %r12, {%rs8, %rs7};
@@ -2148,7 +2245,8 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_shufflevector_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
 ; CHECK-NEXT:    ret;
   %s = shufflevector <2 x half> %a, <2 x half> undef, <2 x i32> <i32 1, i32 0>
@@ -2158,12 +2256,13 @@ define <2 x half> @test_shufflevector(<2 x half> %a) #0 {
 define <2 x half> @test_insertelement(<2 x half> %a, half %x) #0 {
 ; CHECK-LABEL: test_insertelement(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b16 %rs<3>;
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
-; CHECK-NEXT:    ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
 ; CHECK-NEXT:    ret;
   %i = insertelement <2 x half> %a, half %x, i64 1
@@ -2177,7 +2276,8 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_sitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rn.f16.s16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rn.f16.s16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -2193,7 +2293,8 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_uitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; CHECK-NEXT:    cvt.rn.f16.u16 %rs3, %rs2;
 ; CHECK-NEXT:    cvt.rn.f16.u16 %rs4, %rs1;
 ; CHECK-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
index af3cb63082e7..da9e2d8cba13 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-instructions.ll
@@ -28,29 +28,53 @@ define <2 x float> @test_ret_const() #0 {
 }
 
 define float @test_extract_0(<2 x float> %a) #0 {
-; CHECK-LABEL: test_extract_0(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_extract_0_param_0];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
-; CHECK-NEXT:    ret;
+; CHECK-NOF32X2-LABEL: test_extract_0(
+; CHECK-NOF32X2:       {
+; CHECK-NOF32X2-NEXT:    .reg .b32 %r<2>;
+; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT:  // %bb.0:
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-NOF32X2-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NOF32X2-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NOF32X2-NEXT:    ret;
+;
+; CHECK-F32X2-LABEL: test_extract_0(
+; CHECK-F32X2:       {
+; CHECK-F32X2-NEXT:    .reg .b32 %r<2>;
+; CHECK-F32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT:  // %bb.0:
+; CHECK-F32X2-NEXT:    ld.param.b64 %rd1, [test_extract_0_param_0];
+; CHECK-F32X2-NEXT:    mov.b64 {%r1, _}, %rd1;
+; CHECK-F32X2-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-F32X2-NEXT:    ret;
   %e = extractelement <2 x float> %a, i32 0
   ret float %e
 }
 
 define float @test_extract_1(<2 x float> %a) #0 {
-; CHECK-LABEL: test_extract_1(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_extract_1_param_0];
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
-; CHECK-NEXT:    ret;
+; CHECK-NOF32X2-LABEL: test_extract_1(
+; CHECK-NOF32X2:       {
+; CHECK-NOF32X2-NEXT:    .reg .b32 %r<2>;
+; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT:  // %bb.0:
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-NOF32X2-NEXT:    { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
+; CHECK-NOF32X2-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NOF32X2-NEXT:    ret;
+;
+; CHECK-F32X2-LABEL: test_extract_1(
+; CHECK-F32X2:       {
+; CHECK-F32X2-NEXT:    .reg .b32 %r<2>;
+; CHECK-F32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT:  // %bb.0:
+; CHECK-F32X2-NEXT:    ld.param.b64 %rd1, [test_extract_1_param_0];
+; CHECK-F32X2-NEXT:    mov.b64 {_, %r1}, %rd1;
+; CHECK-F32X2-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-F32X2-NEXT:    ret;
   %e = extractelement <2 x float> %a, i32 1
   ret float %e
 }
@@ -70,10 +94,12 @@ define <2 x float> @test_fadd(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fadd_param_1];
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fadd_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -98,7 +124,8 @@ define <2 x float> @test_fadd_imm_0(<2 x float> %a) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_param_0];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_imm_0_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -128,7 +155,8 @@ define <2 x float> @test_fadd_imm_1(<2 x float> %a) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_param_0];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_imm_1_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r4, %r1, 0f3F800000;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -158,13 +186,17 @@ define <4 x float> @test_fadd_v4(<4 x float> %a, <4 x float> %b) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_param_1];
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r9, %r4, %r8;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r10, %r3, %r7;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r11, %r2, %r6;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r12, %r1, %r5;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r7, %r8}, %rd3;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r9, %r10}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r11, %r10, %r8;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r12, %r9, %r7;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_v4(
@@ -189,12 +221,14 @@ define <4 x float> @test_fadd_imm_0_v4(<4 x float> %a) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_param_0];
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r5, %r4, 0f40800000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r6, %r3, 0f40400000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r7, %r2, 0f40000000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r3, %r2, 0f40800000;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r4, %r1, 0f40400000;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r7, %r6, 0f40000000;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r8, %r5, 0f3F800000;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4(
@@ -225,12 +259,14 @@ define <4 x float> @test_fadd_imm_1_v4(<4 x float> %a) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_param_0];
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r5, %r4, 0f40800000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r6, %r3, 0f40400000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r7, %r2, 0f40000000;
-; CHECK-NOF32X2-NEXT:    add.rn.f32 %r8, %r1, 0f3F800000;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r3, %r2, 0f40800000;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r4, %r1, 0f40400000;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r7, %r6, 0f40000000;
+; CHECK-NOF32X2-NEXT:    add.rn.f32 %r8, %r5, 0f3F800000;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4(
@@ -261,10 +297,12 @@ define <2 x float> @test_fsub(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fsub_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fsub_param_1];
-; CHECK-NOF32X2-NEXT:    sub.rn.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    sub.rn.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fsub_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fsub_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    sub.rn.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    sub.rn.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -289,7 +327,8 @@ define <2 x float> @test_fneg(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fneg_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fneg_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    neg.f32 %r3, %r2;
 ; CHECK-NEXT:    neg.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -305,10 +344,12 @@ define <2 x float> @test_fmul(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fmul_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fmul_param_1];
-; CHECK-NOF32X2-NEXT:    mul.rn.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    mul.rn.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fmul_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fmul_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    mul.rn.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    mul.rn.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -333,10 +374,12 @@ define <2 x float> @test_fdiv(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fdiv_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fdiv_param_1];
-; CHECK-NEXT:    div.rn.f32 %r5, %r2, %r4;
-; CHECK-NEXT:    div.rn.f32 %r6, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fdiv_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fdiv_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    div.rn.f32 %r6, %r3, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NEXT:    ret;
   %r = fdiv <2 x float> %a, %b
@@ -351,20 +394,22 @@ define <2 x float> @test_frem(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_frem_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_frem_param_1];
-; CHECK-NEXT:    div.rn.f32 %r5, %r2, %r4;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_frem_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_frem_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    div.rn.f32 %r5, %r4, %r2;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r6, %r5;
 ; CHECK-NEXT:    neg.f32 %r7, %r6;
-; CHECK-NEXT:    fma.rn.f32 %r8, %r7, %r4, %r2;
-; CHECK-NEXT:    testp.infinite.f32 %p1, %r4;
-; CHECK-NEXT:    selp.f32 %r9, %r2, %r8, %p1;
-; CHECK-NEXT:    div.rn.f32 %r10, %r1, %r3;
+; CHECK-NEXT:    fma.rn.f32 %r8, %r7, %r2, %r4;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r2;
+; CHECK-NEXT:    selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-NEXT:    div.rn.f32 %r10, %r3, %r1;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r11, %r10;
 ; CHECK-NEXT:    neg.f32 %r12, %r11;
-; CHECK-NEXT:    fma.rn.f32 %r13, %r12, %r3, %r1;
-; CHECK-NEXT:    testp.infinite.f32 %p2, %r3;
-; CHECK-NEXT:    selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NEXT:    fma.rn.f32 %r13, %r12, %r1, %r3;
+; CHECK-NEXT:    testp.infinite.f32 %p2, %r1;
+; CHECK-NEXT:    selp.f32 %r14, %r3, %r13, %p2;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r9};
 ; CHECK-NEXT:    ret;
   %r = frem <2 x float> %a, %b
@@ -378,10 +423,12 @@ define <2 x float> @test_fadd_ftz(<2 x float> %a, <2 x float> %b) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fadd_ftz_param_1];
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fadd_ftz_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -406,7 +453,8 @@ define <2 x float> @test_fadd_imm_0_ftz(<2 x float> %a) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_0_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_imm_0_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r4, %r1, 0f3F800000;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -436,7 +484,8 @@ define <2 x float> @test_fadd_imm_1_ftz(<2 x float> %a) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fadd_imm_1_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fadd_imm_1_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r3, %r2, 0f40000000;
 ; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r4, %r1, 0f3F800000;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -466,13 +515,17 @@ define <4 x float> @test_fadd_v4_ftz(<4 x float> %a, <4 x float> %b) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_fadd_v4_ftz_param_1];
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r9, %r4, %r8;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r10, %r3, %r7;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r11, %r2, %r6;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r12, %r1, %r5;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r10, %r9};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [test_fadd_v4_ftz_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r6, %r3, %r1;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r7, %r8}, %rd3;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r9, %r10}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r11, %r10, %r8;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r12, %r9, %r7;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r12, %r11, %r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_v4_ftz(
@@ -497,12 +550,14 @@ define <4 x float> @test_fadd_imm_0_v4_ftz(<4 x float> %a) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_0_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r5, %r4, 0f40800000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r6, %r3, 0f40400000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r7, %r2, 0f40000000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r8, %r1, 0f3F800000;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_0_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r3, %r2, 0f40800000;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r4, %r1, 0f40400000;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r7, %r6, 0f40000000;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r8, %r5, 0f3F800000;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_imm_0_v4_ftz(
@@ -533,12 +588,14 @@ define <4 x float> @test_fadd_imm_1_v4_ftz(<4 x float> %a) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_fadd_imm_1_v4_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r5, %r4, 0f40800000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r6, %r3, 0f40400000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r7, %r2, 0f40000000;
-; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r8, %r1, 0f3F800000;
-; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
+; CHECK-NOF32X2-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_fadd_imm_1_v4_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r3, %r2, 0f40800000;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r4, %r1, 0f40400000;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r7, %r6, 0f40000000;
+; CHECK-NOF32X2-NEXT:    add.rn.ftz.f32 %r8, %r5, 0f3F800000;
+; CHECK-NOF32X2-NEXT:    st.param.v4.b32 [func_retval0], {%r8, %r7, %r4, %r3};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
 ; CHECK-F32X2-LABEL: test_fadd_imm_1_v4_ftz(
@@ -569,10 +626,12 @@ define <2 x float> @test_fsub_ftz(<2 x float> %a, <2 x float> %b) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fsub_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fsub_ftz_param_1];
-; CHECK-NOF32X2-NEXT:    sub.rn.ftz.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    sub.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fsub_ftz_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fsub_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    sub.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    sub.rn.ftz.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -597,7 +656,8 @@ define <2 x float> @test_fneg_ftz(<2 x float> %a) #2 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fneg_ftz_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fneg_ftz_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    neg.ftz.f32 %r3, %r2;
 ; CHECK-NEXT:    neg.ftz.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -613,10 +673,12 @@ define <2 x float> @test_fmul_ftz(<2 x float> %a, <2 x float> %b) #2 {
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fmul_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fmul_ftz_param_1];
-; CHECK-NOF32X2-NEXT:    mul.rn.ftz.f32 %r5, %r2, %r4;
-; CHECK-NOF32X2-NEXT:    mul.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fmul_ftz_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fmul_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NOF32X2-NEXT:    mul.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    mul.rn.ftz.f32 %r6, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -641,11 +703,14 @@ define <2 x float> @test_fma_ftz(<2 x float> %a, <2 x float> %b, <2 x float> %c)
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fma_ftz_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fma_ftz_param_1];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_fma_ftz_param_2];
-; CHECK-NOF32X2-NEXT:    fma.rn.ftz.f32 %r7, %r2, %r4, %r6;
-; CHECK-NOF32X2-NEXT:    fma.rn.ftz.f32 %r8, %r1, %r3, %r5;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd3, [test_fma_ftz_param_2];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fma_ftz_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fma_ftz_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd3;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    fma.rn.ftz.f32 %r7, %r6, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    fma.rn.ftz.f32 %r8, %r5, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -671,10 +736,12 @@ define <2 x float> @test_fdiv_ftz(<2 x float> %a, <2 x float> %b) #2 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fdiv_ftz_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fdiv_ftz_param_1];
-; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r2, %r4;
-; CHECK-NEXT:    div.rn.ftz.f32 %r6, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fdiv_ftz_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fdiv_ftz_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    div.rn.ftz.f32 %r6, %r3, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NEXT:    ret;
   %r = fdiv <2 x float> %a, %b
@@ -689,20 +756,22 @@ define <2 x float> @test_frem_ftz(<2 x float> %a, <2 x float> %b) #2 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_frem_ftz_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_frem_ftz_param_1];
-; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r2, %r4;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_frem_ftz_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_frem_ftz_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    div.rn.ftz.f32 %r5, %r4, %r2;
 ; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r6, %r5;
 ; CHECK-NEXT:    neg.ftz.f32 %r7, %r6;
-; CHECK-NEXT:    fma.rn.ftz.f32 %r8, %r7, %r4, %r2;
-; CHECK-NEXT:    testp.infinite.f32 %p1, %r4;
-; CHECK-NEXT:    selp.f32 %r9, %r2, %r8, %p1;
-; CHECK-NEXT:    div.rn.ftz.f32 %r10, %r1, %r3;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r8, %r7, %r2, %r4;
+; CHECK-NEXT:    testp.infinite.f32 %p1, %r2;
+; CHECK-NEXT:    selp.f32 %r9, %r4, %r8, %p1;
+; CHECK-NEXT:    div.rn.ftz.f32 %r10, %r3, %r1;
 ; CHECK-NEXT:    cvt.rzi.ftz.f32.f32 %r11, %r10;
 ; CHECK-NEXT:    neg.ftz.f32 %r12, %r11;
-; CHECK-NEXT:    fma.rn.ftz.f32 %r13, %r12, %r3, %r1;
-; CHECK-NEXT:    testp.infinite.f32 %p2, %r3;
-; CHECK-NEXT:    selp.f32 %r14, %r1, %r13, %p2;
+; CHECK-NEXT:    fma.rn.ftz.f32 %r13, %r12, %r1, %r3;
+; CHECK-NEXT:    testp.infinite.f32 %p2, %r1;
+; CHECK-NEXT:    selp.f32 %r14, %r3, %r13, %p2;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r9};
 ; CHECK-NEXT:    ret;
   %r = frem <2 x float> %a, %b
@@ -877,14 +946,18 @@ define <2 x float> @test_select_cc(<2 x float> %a, <2 x float> %b, <2 x float> %
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_param_2];
-; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_param_3];
-; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r5;
-; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r6;
-; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [test_select_cc_param_1];
-; CHECK-NEXT:    selp.f32 %r9, %r2, %r8, %p2;
-; CHECK-NEXT:    selp.f32 %r10, %r1, %r7, %p1;
+; CHECK-NEXT:    ld.param.b64 %rd4, [test_select_cc_param_3];
+; CHECK-NEXT:    ld.param.b64 %rd3, [test_select_cc_param_2];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_select_cc_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_cc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd3;
+; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r1;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r5, %r6}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT:    selp.f32 %r9, %r8, %r6, %p2;
+; CHECK-NEXT:    selp.f32 %r10, %r7, %r5, %p1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r10, %r9};
 ; CHECK-NEXT:    ret;
   %cc = fcmp une <2 x float> %c, %d
@@ -902,10 +975,12 @@ define <2 x double> @test_select_cc_f64_f32(<2 x double> %a, <2 x double> %b, <2
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f64_f32_param_1];
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [test_select_cc_f64_f32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f64_f32_param_2];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f64_f32_param_3];
-; CHECK-NEXT:    setp.neu.f32 %p1, %r1, %r3;
-; CHECK-NEXT:    setp.neu.f32 %p2, %r2, %r4;
+; CHECK-NEXT:    ld.param.b64 %rd6, [test_select_cc_f64_f32_param_3];
+; CHECK-NEXT:    ld.param.b64 %rd5, [test_select_cc_f64_f32_param_2];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd6;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd5;
+; CHECK-NEXT:    setp.neu.f32 %p1, %r3, %r1;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r4, %r2;
 ; CHECK-NEXT:    selp.f64 %rd7, %rd2, %rd4, %p2;
 ; CHECK-NEXT:    selp.f64 %rd8, %rd1, %rd3, %p1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd8, %rd7};
@@ -925,12 +1000,14 @@ define <2 x float> @test_select_cc_f32_f64(<2 x float> %a, <2 x float> %b, <2 x
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [test_select_cc_f32_f64_param_3];
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [test_select_cc_f32_f64_param_2];
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_f32_f64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_select_cc_f32_f64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_select_cc_f32_f64_param_0];
 ; CHECK-NEXT:    setp.neu.f64 %p1, %rd3, %rd5;
 ; CHECK-NEXT:    setp.neu.f64 %p2, %rd4, %rd6;
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_f32_f64_param_1];
-; CHECK-NEXT:    selp.f32 %r5, %r2, %r4, %p2;
-; CHECK-NEXT:    selp.f32 %r6, %r1, %r3, %p1;
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    selp.f32 %r5, %r4, %r2, %p2;
+; CHECK-NEXT:    selp.f32 %r6, %r3, %r1, %p1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NEXT:    ret;
   %cc = fcmp une <2 x double> %c, %d
@@ -947,10 +1024,12 @@ define <2 x i1> @test_fcmp_une(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_une_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_une_param_1];
-; CHECK-NEXT:    setp.neu.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.neu.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_une_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_une_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.neu.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.neu.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -969,10 +1048,12 @@ define <2 x i1> @test_fcmp_ueq(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ueq_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ueq_param_1];
-; CHECK-NEXT:    setp.equ.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.equ.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ueq_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ueq_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.equ.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.equ.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -991,10 +1072,12 @@ define <2 x i1> @test_fcmp_ugt(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ugt_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ugt_param_1];
-; CHECK-NEXT:    setp.gtu.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.gtu.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ugt_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ugt_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.gtu.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.gtu.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1013,10 +1096,12 @@ define <2 x i1> @test_fcmp_uge(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uge_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uge_param_1];
-; CHECK-NEXT:    setp.geu.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.geu.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_uge_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_uge_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.geu.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.geu.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1035,10 +1120,12 @@ define <2 x i1> @test_fcmp_ult(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ult_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ult_param_1];
-; CHECK-NEXT:    setp.ltu.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.ltu.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ult_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ult_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.ltu.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.ltu.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1057,10 +1144,12 @@ define <2 x i1> @test_fcmp_ule(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ule_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ule_param_1];
-; CHECK-NEXT:    setp.leu.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.leu.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ule_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ule_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.leu.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.leu.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1079,10 +1168,12 @@ define <2 x i1> @test_fcmp_uno(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_uno_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_uno_param_1];
-; CHECK-NEXT:    setp.nan.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.nan.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_uno_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_uno_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.nan.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.nan.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1101,10 +1192,12 @@ define <2 x i1> @test_fcmp_one(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_one_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_one_param_1];
-; CHECK-NEXT:    setp.ne.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.ne.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_one_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_one_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.ne.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.ne.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1123,10 +1216,12 @@ define <2 x i1> @test_fcmp_oeq(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oeq_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oeq_param_1];
-; CHECK-NEXT:    setp.eq.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.eq.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_oeq_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_oeq_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.eq.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.eq.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1145,10 +1240,12 @@ define <2 x i1> @test_fcmp_ogt(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ogt_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ogt_param_1];
-; CHECK-NEXT:    setp.gt.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.gt.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ogt_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ogt_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.gt.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.gt.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1167,10 +1264,12 @@ define <2 x i1> @test_fcmp_oge(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_oge_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_oge_param_1];
-; CHECK-NEXT:    setp.ge.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.ge.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_oge_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_oge_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.ge.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.ge.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1189,10 +1288,12 @@ define <2 x i1> @test_fcmp_olt(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_olt_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_olt_param_1];
-; CHECK-NEXT:    setp.lt.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.lt.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_olt_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_olt_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.lt.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.lt.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1211,10 +1312,12 @@ define <2 x i1> @test_fcmp_ole(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ole_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ole_param_1];
-; CHECK-NEXT:    setp.le.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.le.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ole_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ole_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.le.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.le.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1233,10 +1336,12 @@ define <2 x i1> @test_fcmp_ord(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fcmp_ord_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fcmp_ord_param_1];
-; CHECK-NEXT:    setp.num.f32 %p1, %r2, %r4;
-; CHECK-NEXT:    setp.num.f32 %p2, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_fcmp_ord_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fcmp_ord_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    setp.num.f32 %p1, %r4, %r2;
+; CHECK-NEXT:    setp.num.f32 %p2, %r3, %r1;
 ; CHECK-NEXT:    selp.b16 %rs1, -1, 0, %p2;
 ; CHECK-NEXT:    st.param.b8 [func_retval0], %rs1;
 ; CHECK-NEXT:    selp.b16 %rs2, -1, 0, %p1;
@@ -1253,7 +1358,8 @@ define <2 x i32> @test_fptosi_i32(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptosi_i32_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rzi.s32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rzi.s32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1269,7 +1375,8 @@ define <2 x i64> @test_fptosi_i64(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptosi_i64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptosi_i64_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rzi.s64.f32 %rd2, %r2;
 ; CHECK-NEXT:    cvt.rzi.s64.f32 %rd3, %r1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd2};
@@ -1285,7 +1392,8 @@ define <2 x i32> @test_fptoui_2xi32(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptoui_2xi32_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rzi.u32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rzi.u32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1301,7 +1409,8 @@ define <2 x i64> @test_fptoui_2xi64(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fptoui_2xi64_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rzi.u64.f32 %rd2, %r2;
 ; CHECK-NEXT:    cvt.rzi.u64.f32 %rd3, %r1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd2};
@@ -1380,9 +1489,10 @@ define <2 x float> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x float> %b) #0 {
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
 ; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_uitofp_2xi32_fadd_param_1];
 ; CHECK-NOF32X2-NEXT:    cvt.rn.f32.u32 %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    cvt.rn.f32.u32 %r4, %r2;
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_uitofp_2xi32_fadd_param_1];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r7, %r6, %r4;
 ; CHECK-NOF32X2-NEXT:    add.rn.f32 %r8, %r5, %r3;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
@@ -1431,7 +1541,8 @@ define <2 x double> @test_fpext_2xdouble(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fpext_2xdouble_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.f64.f32 %rd2, %r2;
 ; CHECK-NEXT:    cvt.f64.f32 %rd3, %r1;
 ; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd3, %rd2};
@@ -1499,7 +1610,8 @@ define <2 x float> @test_sqrt(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sqrt_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sqrt_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    sqrt.rn.f32 %r3, %r2;
 ; CHECK-NEXT:    sqrt.rn.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1522,7 +1634,8 @@ define <2 x float> @test_sin(<2 x float> %a) #0 #1 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_sin_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_sin_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    sin.approx.f32 %r3, %r2;
 ; CHECK-NEXT:    sin.approx.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1538,7 +1651,8 @@ define <2 x float> @test_cos(<2 x float> %a) #0 #1 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_cos_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_cos_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cos.approx.f32 %r3, %r2;
 ; CHECK-NEXT:    cos.approx.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1597,11 +1711,14 @@ define <2 x float> @test_fma(<2 x float> %a, <2 x float> %b, <2 x float> %c) #0
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fma_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fma_param_1];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_fma_param_2];
-; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r7, %r2, %r4, %r6;
-; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r8, %r1, %r3, %r5;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd3, [test_fma_param_2];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fma_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fma_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd3;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r8, %r5, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -1627,7 +1744,8 @@ define <2 x float> @test_fabs(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fabs_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_fabs_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    abs.f32 %r3, %r2;
 ; CHECK-NEXT:    abs.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1643,10 +1761,12 @@ define <2 x float> @test_minnum(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_minnum_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_minnum_param_1];
-; CHECK-NEXT:    min.f32 %r5, %r2, %r4;
-; CHECK-NEXT:    min.f32 %r6, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_minnum_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_minnum_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    min.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    min.f32 %r6, %r3, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NEXT:    ret;
   %r = call <2 x float> @llvm.minnum(<2 x float> %a, <2 x float> %b)
@@ -1660,10 +1780,12 @@ define <2 x float> @test_maxnum(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_maxnum_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_maxnum_param_1];
-; CHECK-NEXT:    max.f32 %r5, %r2, %r4;
-; CHECK-NEXT:    max.f32 %r6, %r1, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_maxnum_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_maxnum_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd1;
+; CHECK-NEXT:    max.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    max.f32 %r6, %r3, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
 ; CHECK-NEXT:    ret;
   %r = call <2 x float> @llvm.maxnum(<2 x float> %a, <2 x float> %b)
@@ -1677,8 +1799,10 @@ define <2 x float> @test_copysign(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_copysign_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_copysign_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_copysign_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_copysign_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; CHECK-NEXT:    copysign.f32 %r5, %r4, %r2;
 ; CHECK-NEXT:    copysign.f32 %r6, %r3, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r6, %r5};
@@ -1696,18 +1820,19 @@ define <2 x float> @test_copysign_f64(<2 x float> %a, <2 x double> %b) #0 {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd2, %rd3}, [test_copysign_f64_param_1];
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_copysign_f64_param_0];
-; CHECK-NEXT:    abs.f32 %r3, %r2;
-; CHECK-NEXT:    neg.f32 %r4, %r3;
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_copysign_f64_param_0];
 ; CHECK-NEXT:    shr.u64 %rd4, %rd3, 63;
 ; CHECK-NEXT:    and.b64 %rd5, %rd4, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p1, %rd5, 0;
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT:    abs.f32 %r3, %r2;
+; CHECK-NEXT:    neg.f32 %r4, %r3;
 ; CHECK-NEXT:    selp.f32 %r5, %r4, %r3, %p1;
-; CHECK-NEXT:    abs.f32 %r6, %r1;
-; CHECK-NEXT:    neg.f32 %r7, %r6;
 ; CHECK-NEXT:    shr.u64 %rd6, %rd2, 63;
 ; CHECK-NEXT:    and.b64 %rd7, %rd6, 1;
 ; CHECK-NEXT:    setp.ne.b64 %p2, %rd7, 0;
+; CHECK-NEXT:    abs.f32 %r6, %r1;
+; CHECK-NEXT:    neg.f32 %r7, %r6;
 ; CHECK-NEXT:    selp.f32 %r8, %r7, %r6, %p2;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r5};
 ; CHECK-NEXT:    ret;
@@ -1723,8 +1848,10 @@ define <2 x double> @test_copysign_extended(<2 x float> %a, <2 x float> %b) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_copysign_extended_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_copysign_extended_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd2, [test_copysign_extended_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_copysign_extended_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
 ; CHECK-NEXT:    copysign.f32 %r5, %r3, %r1;
 ; CHECK-NEXT:    copysign.f32 %r6, %r4, %r2;
 ; CHECK-NEXT:    cvt.f64.f32 %rd3, %r6;
@@ -1743,7 +1870,8 @@ define <2 x float> @test_floor(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_floor_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_floor_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rmi.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rmi.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1759,7 +1887,8 @@ define <2 x float> @test_ceil(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_ceil_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_ceil_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rpi.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rpi.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1775,7 +1904,8 @@ define <2 x float> @test_trunc(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_trunc_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_trunc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rzi.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1791,7 +1921,8 @@ define <2 x float> @test_rint(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_rint_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_rint_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1807,7 +1938,8 @@ define <2 x float> @test_nearbyint(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_nearbyint_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_nearbyint_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1823,7 +1955,8 @@ define <2 x float> @test_roundeven(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_roundeven_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_roundeven_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r3, %r2;
 ; CHECK-NEXT:    cvt.rni.f32.f32 %r4, %r1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r4, %r3};
@@ -1841,7 +1974,8 @@ define <2 x float> @test_round(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_round_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_round_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    and.b32 %r3, %r2, -2147483648;
 ; CHECK-NEXT:    or.b32 %r4, %r3, 1056964608;
 ; CHECK-NEXT:    add.rn.f32 %r5, %r2, %r4;
@@ -1875,11 +2009,14 @@ define <2 x float> @test_fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
 ; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-NOF32X2-EMPTY:
 ; CHECK-NOF32X2-NEXT:  // %bb.0:
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fmuladd_param_0];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_fmuladd_param_1];
-; CHECK-NOF32X2-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_fmuladd_param_2];
-; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r7, %r2, %r4, %r6;
-; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r8, %r1, %r3, %r5;
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd3, [test_fmuladd_param_2];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd2, [test_fmuladd_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_fmuladd_param_0];
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r1, %r2}, %rd3;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NOF32X2-NEXT:    mov.b64 {%r5, %r6}, %rd1;
+; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r7, %r6, %r4, %r2;
+; CHECK-NOF32X2-NEXT:    fma.rn.f32 %r8, %r5, %r3, %r1;
 ; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
 ; CHECK-NOF32X2-NEXT:    ret;
 ;
@@ -1905,7 +2042,8 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 {
 ; CHECK-NEXT:    .reg .b64 %rd<2>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_shufflevector_param_0];
+; CHECK-NEXT:    ld.param.b64 %rd1, [test_shufflevector_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd1;
 ; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r1};
 ; CHECK-NEXT:    ret;
   %s = shufflevector <2 x float> %a, <2 x float> poison, <2 x i32> <i32 1, i32 0>
@@ -1913,16 +2051,29 @@ define <2 x float> @test_shufflevector(<2 x float> %a) #0 {
 }
 
 define <2 x float> @test_insertelement(<2 x float> %a, float %x) #0 {
-; CHECK-LABEL: test_insertelement(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.b32 %r1, [test_insertelement_param_1];
-; CHECK-NEXT:    ld.param.v2.b32 {%r2, %r3}, [test_insertelement_param_0];
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r1};
-; CHECK-NEXT:    ret;
+; CHECK-NOF32X2-LABEL: test_insertelement(
+; CHECK-NOF32X2:       {
+; CHECK-NOF32X2-NEXT:    .reg .b32 %r<3>;
+; CHECK-NOF32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NOF32X2-EMPTY:
+; CHECK-NOF32X2-NEXT:  // %bb.0:
+; CHECK-NOF32X2-NEXT:    ld.param.b32 %r1, [test_insertelement_param_1];
+; CHECK-NOF32X2-NEXT:    ld.param.b64 %rd1, [test_insertelement_param_0];
+; CHECK-NOF32X2-NEXT:    { .reg .b32 tmp; mov.b64 {%r2, tmp}, %rd1; }
+; CHECK-NOF32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-NOF32X2-NEXT:    ret;
+;
+; CHECK-F32X2-LABEL: test_insertelement(
+; CHECK-F32X2:       {
+; CHECK-F32X2-NEXT:    .reg .b32 %r<3>;
+; CHECK-F32X2-NEXT:    .reg .b64 %rd<2>;
+; CHECK-F32X2-EMPTY:
+; CHECK-F32X2-NEXT:  // %bb.0:
+; CHECK-F32X2-NEXT:    ld.param.b32 %r1, [test_insertelement_param_1];
+; CHECK-F32X2-NEXT:    ld.param.b64 %rd1, [test_insertelement_param_0];
+; CHECK-F32X2-NEXT:    mov.b64 {%r2, _}, %rd1;
+; CHECK-F32X2-NEXT:    st.param.v2.b32 [func_retval0], {%r2, %r1};
+; CHECK-F32X2-NEXT:    ret;
   %i = insertelement <2 x float> %a, float %x, i64 1
   ret <2 x float> %i
 }
diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll
index f1adc3489c0d..9a051b3fd8bb 100644
--- a/llvm/test/CodeGen/NVPTX/i1-select.ll
+++ b/llvm/test/CodeGen/NVPTX/i1-select.ll
@@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
 define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) {
 ; CHECK-LABEL: test_select_i1_basic_folding(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<12>;
-; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .pred %p<13>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
 ; CHECK-NEXT:    setp.eq.b32 %p1, %r1, 0;
-; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_1];
-; CHECK-NEXT:    setp.ne.b32 %p2, %r3, 0;
-; CHECK-NEXT:    setp.eq.b32 %p3, %r3, 0;
-; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_2];
-; CHECK-NEXT:    setp.eq.b32 %p4, %r5, 0;
-; CHECK-NEXT:    ld.param.b32 %r6, [test_select_i1_basic_folding_param_3];
+; CHECK-NEXT:    ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
+; CHECK-NEXT:    setp.ne.b32 %p2, %r2, 0;
+; CHECK-NEXT:    setp.eq.b32 %p3, %r2, 0;
+; CHECK-NEXT:    ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
+; CHECK-NEXT:    setp.eq.b32 %p4, %r3, 0;
+; CHECK-NEXT:    ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
 ; CHECK-NEXT:    xor.pred %p6, %p1, %p3;
-; CHECK-NEXT:    ld.param.b32 %r7, [test_select_i1_basic_folding_param_4];
+; CHECK-NEXT:    ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
 ; CHECK-NEXT:    and.pred %p7, %p6, %p4;
-; CHECK-NEXT:    and.pred %p8, %p2, %p4;
-; CHECK-NEXT:    and.pred %p9, %p3, %p7;
-; CHECK-NEXT:    or.pred %p10, %p9, %p8;
-; CHECK-NEXT:    xor.pred %p11, %p10, %p3;
-; CHECK-NEXT:    selp.b32 %r8, %r6, %r7, %p11;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    and.pred %p9, %p2, %p4;
+; CHECK-NEXT:    and.pred %p10, %p3, %p7;
+; CHECK-NEXT:    or.pred %p11, %p10, %p9;
+; CHECK-NEXT:    xor.pred %p12, %p11, %p3;
+; CHECK-NEXT:    selp.b32 %r6, %r4, %r5, %p12;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %b1 = icmp eq i32 %v1, 0
   %b2 = icmp eq i32 %v2, 0
diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll
index f2211eb1c0b8..44d85589b505 100644
--- a/llvm/test/CodeGen/NVPTX/i128.ll
+++ b/llvm/test/CodeGen/NVPTX/i128.ll
@@ -5,9 +5,9 @@
 define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: srem_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<22>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<126>;
+; CHECK-NEXT:    .reg .b64 %rd<127>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
@@ -42,103 +42,102 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd62, %r4;
 ; CHECK-NEXT:    add.s64 %rd63, %rd62, 64;
 ; CHECK-NEXT:    selp.b64 %rd64, %rd61, %rd63, %p7;
-; CHECK-NEXT:    mov.b64 %rd116, 0;
+; CHECK-NEXT:    mov.b64 %rd117, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd66, %rd60, %rd64;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd116, 0;
-; CHECK-NEXT:    setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT:    and.pred %p10, %p8, %p8;
-; CHECK-NEXT:    setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT:    setp.gt.u64 %p12, %rd66, 127;
-; CHECK-NEXT:    and.pred %p13, %p11, %p12;
-; CHECK-NEXT:    or.pred %p14, %p13, %p10;
-; CHECK-NEXT:    or.pred %p15, %p5, %p14;
-; CHECK-NEXT:    xor.b64 %rd67, %rd66, 127;
-; CHECK-NEXT:    or.b64 %rd68, %rd67, %rd8;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd68, 0;
-; CHECK-NEXT:    selp.b64 %rd125, 0, %rd4, %p15;
-; CHECK-NEXT:    selp.b64 %rd124, 0, %rd3, %p15;
-; CHECK-NEXT:    or.pred %p17, %p15, %p16;
-; CHECK-NEXT:    @%p17 bra $L__BB0_5;
+; CHECK-NEXT:    subc.cc.s64 %rd67, %rd117, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd66, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd67, 0;
+; CHECK-NEXT:    and.pred %p10, %p9, %p8;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd67, 0;
+; CHECK-NEXT:    or.pred %p12, %p10, %p11;
+; CHECK-NEXT:    or.pred %p13, %p5, %p12;
+; CHECK-NEXT:    xor.b64 %rd68, %rd66, 127;
+; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd67;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd69, 0;
+; CHECK-NEXT:    selp.b64 %rd126, 0, %rd4, %p13;
+; CHECK-NEXT:    selp.b64 %rd125, 0, %rd3, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB0_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd118, %rd66, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd119, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd71, %rd118, %rd119;
-; CHECK-NEXT:    setp.eq.b64 %p18, %rd71, 0;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd66, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd67, 0;
+; CHECK-NEXT:    or.b64 %rd72, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd72, 0;
 ; CHECK-NEXT:    cvt.u32.u64 %r5, %rd66;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd72, %rd4, %r6;
+; CHECK-NEXT:    shl.b64 %rd73, %rd4, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd73, %rd3, %r7;
-; CHECK-NEXT:    or.b64 %rd74, %rd72, %rd73;
+; CHECK-NEXT:    shr.u64 %rd74, %rd3, %r7;
+; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd75, %rd3, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd123, %rd75, %rd74, %p19;
-; CHECK-NEXT:    shl.b64 %rd122, %rd3, %r6;
-; CHECK-NEXT:    mov.b64 %rd113, %rd116;
-; CHECK-NEXT:    @%p18 bra $L__BB0_4;
+; CHECK-NEXT:    shl.b64 %rd76, %rd3, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd124, %rd76, %rd75, %p17;
+; CHECK-NEXT:    shl.b64 %rd123, %rd3, %r6;
+; CHECK-NEXT:    mov.b64 %rd114, %rd117;
+; CHECK-NEXT:    @%p16 bra $L__BB0_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd118;
-; CHECK-NEXT:    shr.u64 %rd78, %rd3, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd119;
+; CHECK-NEXT:    shr.u64 %rd79, %rd3, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd79, %rd4, %r10;
-; CHECK-NEXT:    or.b64 %rd80, %rd78, %rd79;
+; CHECK-NEXT:    shl.b64 %rd80, %rd4, %r10;
+; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd81, %rd4, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd120, %rd81, %rd80, %p20;
-; CHECK-NEXT:    shr.u64 %rd121, %rd4, %r9;
+; CHECK-NEXT:    shr.u64 %rd82, %rd4, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd121, %rd82, %rd81, %p18;
+; CHECK-NEXT:    shr.u64 %rd122, %rd4, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd5, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd6, -1;
-; CHECK-NEXT:    mov.b64 %rd113, 0;
-; CHECK-NEXT:    mov.b64 %rd116, %rd113;
+; CHECK-NEXT:    mov.b64 %rd114, 0;
+; CHECK-NEXT:    mov.b64 %rd117, %rd114;
 ; CHECK-NEXT:  $L__BB0_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd82, %rd120, 63;
-; CHECK-NEXT:    shl.b64 %rd83, %rd121, 1;
-; CHECK-NEXT:    or.b64 %rd84, %rd83, %rd82;
-; CHECK-NEXT:    shl.b64 %rd85, %rd120, 1;
-; CHECK-NEXT:    shr.u64 %rd86, %rd123, 63;
-; CHECK-NEXT:    or.b64 %rd87, %rd85, %rd86;
-; CHECK-NEXT:    shr.u64 %rd88, %rd122, 63;
-; CHECK-NEXT:    shl.b64 %rd89, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd90, %rd89, %rd88;
-; CHECK-NEXT:    shl.b64 %rd91, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd122, %rd116, %rd91;
-; CHECK-NEXT:    or.b64 %rd123, %rd113, %rd90;
-; CHECK-NEXT:    sub.cc.s64 %rd92, %rd35, %rd87;
-; CHECK-NEXT:    subc.cc.s64 %rd93, %rd36, %rd84;
-; CHECK-NEXT:    shr.s64 %rd94, %rd93, 63;
-; CHECK-NEXT:    and.b64 %rd116, %rd94, 1;
-; CHECK-NEXT:    and.b64 %rd95, %rd94, %rd5;
-; CHECK-NEXT:    and.b64 %rd96, %rd94, %rd6;
-; CHECK-NEXT:    sub.cc.s64 %rd120, %rd87, %rd95;
-; CHECK-NEXT:    subc.cc.s64 %rd121, %rd84, %rd96;
-; CHECK-NEXT:    add.cc.s64 %rd118, %rd118, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd119, %rd119, -1;
-; CHECK-NEXT:    or.b64 %rd97, %rd118, %rd119;
-; CHECK-NEXT:    setp.eq.b64 %p21, %rd97, 0;
-; CHECK-NEXT:    @%p21 bra $L__BB0_4;
+; CHECK-NEXT:    shr.u64 %rd83, %rd121, 63;
+; CHECK-NEXT:    shl.b64 %rd84, %rd122, 1;
+; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
+; CHECK-NEXT:    shl.b64 %rd86, %rd121, 1;
+; CHECK-NEXT:    shr.u64 %rd87, %rd124, 63;
+; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
+; CHECK-NEXT:    shr.u64 %rd89, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd123, %rd117, %rd92;
+; CHECK-NEXT:    or.b64 %rd124, %rd114, %rd91;
+; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
+; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
+; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
+; CHECK-NEXT:    and.b64 %rd117, %rd95, 1;
+; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd5;
+; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd6;
+; CHECK-NEXT:    sub.cc.s64 %rd121, %rd88, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd122, %rd85, %rd97;
+; CHECK-NEXT:    add.cc.s64 %rd119, %rd119, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd120, %rd120, -1;
+; CHECK-NEXT:    or.b64 %rd98, %rd119, %rd120;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd98, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB0_4;
 ; CHECK-NEXT:    bra.uni $L__BB0_2;
 ; CHECK-NEXT:  $L__BB0_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd98, %rd122, 63;
-; CHECK-NEXT:    shl.b64 %rd99, %rd123, 1;
-; CHECK-NEXT:    or.b64 %rd100, %rd99, %rd98;
-; CHECK-NEXT:    shl.b64 %rd101, %rd122, 1;
-; CHECK-NEXT:    or.b64 %rd124, %rd116, %rd101;
-; CHECK-NEXT:    or.b64 %rd125, %rd113, %rd100;
+; CHECK-NEXT:    shr.u64 %rd99, %rd123, 63;
+; CHECK-NEXT:    shl.b64 %rd100, %rd124, 1;
+; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
+; CHECK-NEXT:    shl.b64 %rd102, %rd123, 1;
+; CHECK-NEXT:    or.b64 %rd125, %rd117, %rd102;
+; CHECK-NEXT:    or.b64 %rd126, %rd114, %rd101;
 ; CHECK-NEXT:  $L__BB0_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd102, %rd5, %rd124;
-; CHECK-NEXT:    mad.lo.s64 %rd103, %rd5, %rd125, %rd102;
-; CHECK-NEXT:    mad.lo.s64 %rd104, %rd6, %rd124, %rd103;
-; CHECK-NEXT:    mul.lo.s64 %rd105, %rd5, %rd124;
-; CHECK-NEXT:    sub.cc.s64 %rd106, %rd3, %rd105;
-; CHECK-NEXT:    subc.cc.s64 %rd107, %rd4, %rd104;
-; CHECK-NEXT:    xor.b64 %rd108, %rd106, %rd2;
+; CHECK-NEXT:    mul.hi.u64 %rd103, %rd5, %rd125;
+; CHECK-NEXT:    mad.lo.s64 %rd104, %rd5, %rd126, %rd103;
+; CHECK-NEXT:    mad.lo.s64 %rd105, %rd6, %rd125, %rd104;
+; CHECK-NEXT:    mul.lo.s64 %rd106, %rd5, %rd125;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd3, %rd106;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd4, %rd105;
 ; CHECK-NEXT:    xor.b64 %rd109, %rd107, %rd2;
-; CHECK-NEXT:    sub.cc.s64 %rd110, %rd108, %rd2;
-; CHECK-NEXT:    subc.cc.s64 %rd111, %rd109, %rd2;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd110, %rd111};
+; CHECK-NEXT:    xor.b64 %rd110, %rd108, %rd2;
+; CHECK-NEXT:    sub.cc.s64 %rd111, %rd109, %rd2;
+; CHECK-NEXT:    subc.cc.s64 %rd112, %rd110, %rd2;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd111, %rd112};
 ; CHECK-NEXT:    ret;
   %div = srem i128 %lhs, %rhs
   ret i128 %div
@@ -149,7 +148,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<111>;
+; CHECK-NEXT:    .reg .b64 %rd<113>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
@@ -173,98 +172,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd101, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd101, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.b64 %rd103, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd103, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd110, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd109, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd112, 0, %rd42, %p11;
+; CHECK-NEXT:    selp.b64 %rd111, 0, %rd41, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB1_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd103, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd104, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd103, %rd104;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd108, %rd64, %rd63, %p15;
-; CHECK-NEXT:    shl.b64 %rd107, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd98, %rd101;
+; CHECK-NEXT:    selp.b64 %rd110, %rd66, %rd65, %p15;
+; CHECK-NEXT:    shl.b64 %rd109, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd100, %rd103;
 ; CHECK-NEXT:    @%p14 bra $L__BB1_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd103;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd105;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd105, %rd70, %rd69, %p16;
-; CHECK-NEXT:    shr.u64 %rd106, %rd42, %r9;
+; CHECK-NEXT:    selp.b64 %rd107, %rd72, %rd71, %p16;
+; CHECK-NEXT:    shr.u64 %rd108, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd98, 0;
-; CHECK-NEXT:    mov.b64 %rd101, %rd98;
+; CHECK-NEXT:    mov.b64 %rd100, 0;
+; CHECK-NEXT:    mov.b64 %rd103, %rd100;
 ; CHECK-NEXT:  $L__BB1_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd105, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd106, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd105, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd108, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd107, 1;
-; CHECK-NEXT:    or.b64 %rd107, %rd101, %rd80;
-; CHECK-NEXT:    or.b64 %rd108, %rd98, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd101, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd3;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd105, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd106, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd103, %rd103, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd104, %rd104, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd103, %rd104;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd107, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd108, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd107, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd110, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd109, %rd103, %rd82;
+; CHECK-NEXT:    or.b64 %rd110, %rd100, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd103, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd3;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd107, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd108, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd105, %rd105, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd106, %rd106, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd105, %rd106;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB1_4;
 ; CHECK-NEXT:    bra.uni $L__BB1_2;
 ; CHECK-NEXT:  $L__BB1_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd107, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd108, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd107, 1;
-; CHECK-NEXT:    or.b64 %rd109, %rd101, %rd90;
-; CHECK-NEXT:    or.b64 %rd110, %rd98, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd109, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd110, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd109, 1;
+; CHECK-NEXT:    or.b64 %rd111, %rd103, %rd92;
+; CHECK-NEXT:    or.b64 %rd112, %rd100, %rd91;
 ; CHECK-NEXT:  $L__BB1_5: // %udiv-end
-; CHECK-NEXT:    mul.hi.u64 %rd91, %rd3, %rd109;
-; CHECK-NEXT:    mad.lo.s64 %rd92, %rd3, %rd110, %rd91;
-; CHECK-NEXT:    mad.lo.s64 %rd93, %rd4, %rd109, %rd92;
-; CHECK-NEXT:    mul.lo.s64 %rd94, %rd3, %rd109;
-; CHECK-NEXT:    sub.cc.s64 %rd95, %rd41, %rd94;
-; CHECK-NEXT:    subc.cc.s64 %rd96, %rd42, %rd93;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd95, %rd96};
+; CHECK-NEXT:    mul.hi.u64 %rd93, %rd3, %rd111;
+; CHECK-NEXT:    mad.lo.s64 %rd94, %rd3, %rd112, %rd93;
+; CHECK-NEXT:    mad.lo.s64 %rd95, %rd4, %rd111, %rd94;
+; CHECK-NEXT:    mul.lo.s64 %rd96, %rd3, %rd111;
+; CHECK-NEXT:    sub.cc.s64 %rd97, %rd41, %rd96;
+; CHECK-NEXT:    subc.cc.s64 %rd98, %rd42, %rd95;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd97, %rd98};
 ; CHECK-NEXT:    ret;
   %div = urem i128 %lhs, %rhs
   ret i128 %div
@@ -307,9 +306,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
 define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: sdiv_i128(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<22>;
+; CHECK-NEXT:    .reg .pred %p<20>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<121>;
+; CHECK-NEXT:    .reg .b64 %rd<122>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
@@ -345,97 +344,96 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd63, %r4;
 ; CHECK-NEXT:    add.s64 %rd64, %rd63, 64;
 ; CHECK-NEXT:    selp.b64 %rd65, %rd62, %rd64, %p7;
-; CHECK-NEXT:    mov.b64 %rd111, 0;
+; CHECK-NEXT:    mov.b64 %rd112, 0;
 ; CHECK-NEXT:    sub.cc.s64 %rd67, %rd61, %rd65;
-; CHECK-NEXT:    subc.cc.s64 %rd8, %rd111, 0;
-; CHECK-NEXT:    setp.ne.b64 %p8, %rd8, 0;
-; CHECK-NEXT:    and.pred %p10, %p8, %p8;
-; CHECK-NEXT:    setp.eq.b64 %p11, %rd8, 0;
-; CHECK-NEXT:    setp.gt.u64 %p12, %rd67, 127;
-; CHECK-NEXT:    and.pred %p13, %p11, %p12;
-; CHECK-NEXT:    or.pred %p14, %p13, %p10;
-; CHECK-NEXT:    or.pred %p15, %p5, %p14;
-; CHECK-NEXT:    xor.b64 %rd68, %rd67, 127;
-; CHECK-NEXT:    or.b64 %rd69, %rd68, %rd8;
-; CHECK-NEXT:    setp.eq.b64 %p16, %rd69, 0;
-; CHECK-NEXT:    selp.b64 %rd120, 0, %rd2, %p15;
-; CHECK-NEXT:    selp.b64 %rd119, 0, %rd1, %p15;
-; CHECK-NEXT:    or.pred %p17, %p15, %p16;
-; CHECK-NEXT:    @%p17 bra $L__BB4_5;
+; CHECK-NEXT:    subc.cc.s64 %rd68, %rd112, 0;
+; CHECK-NEXT:    setp.gt.u64 %p8, %rd67, 127;
+; CHECK-NEXT:    setp.eq.b64 %p9, %rd68, 0;
+; CHECK-NEXT:    and.pred %p10, %p9, %p8;
+; CHECK-NEXT:    setp.ne.b64 %p11, %rd68, 0;
+; CHECK-NEXT:    or.pred %p12, %p10, %p11;
+; CHECK-NEXT:    or.pred %p13, %p5, %p12;
+; CHECK-NEXT:    xor.b64 %rd69, %rd67, 127;
+; CHECK-NEXT:    or.b64 %rd70, %rd69, %rd68;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd70, 0;
+; CHECK-NEXT:    selp.b64 %rd121, 0, %rd2, %p13;
+; CHECK-NEXT:    selp.b64 %rd120, 0, %rd1, %p13;
+; CHECK-NEXT:    or.pred %p15, %p13, %p14;
+; CHECK-NEXT:    @%p15 bra $L__BB4_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd113, %rd67, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd114, %rd8, 0;
-; CHECK-NEXT:    or.b64 %rd72, %rd113, %rd114;
-; CHECK-NEXT:    setp.eq.b64 %p18, %rd72, 0;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd67, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd68, 0;
+; CHECK-NEXT:    or.b64 %rd73, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.b64 %p16, %rd73, 0;
 ; CHECK-NEXT:    cvt.u32.u64 %r5, %rd67;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd73, %rd2, %r6;
+; CHECK-NEXT:    shl.b64 %rd74, %rd2, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd74, %rd1, %r7;
-; CHECK-NEXT:    or.b64 %rd75, %rd73, %rd74;
+; CHECK-NEXT:    shr.u64 %rd75, %rd1, %r7;
+; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd76, %rd1, %r8;
-; CHECK-NEXT:    setp.gt.s32 %p19, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd118, %rd76, %rd75, %p19;
-; CHECK-NEXT:    shl.b64 %rd117, %rd1, %r6;
-; CHECK-NEXT:    mov.b64 %rd108, %rd111;
-; CHECK-NEXT:    @%p18 bra $L__BB4_4;
+; CHECK-NEXT:    shl.b64 %rd77, %rd1, %r8;
+; CHECK-NEXT:    setp.gt.s32 %p17, %r6, 63;
+; CHECK-NEXT:    selp.b64 %rd119, %rd77, %rd76, %p17;
+; CHECK-NEXT:    shl.b64 %rd118, %rd1, %r6;
+; CHECK-NEXT:    mov.b64 %rd109, %rd112;
+; CHECK-NEXT:    @%p16 bra $L__BB4_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd113;
-; CHECK-NEXT:    shr.u64 %rd79, %rd1, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd114;
+; CHECK-NEXT:    shr.u64 %rd80, %rd1, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd80, %rd2, %r10;
-; CHECK-NEXT:    or.b64 %rd81, %rd79, %rd80;
+; CHECK-NEXT:    shl.b64 %rd81, %rd2, %r10;
+; CHECK-NEXT:    or.b64 %rd82, %rd80, %rd81;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd82, %rd2, %r11;
-; CHECK-NEXT:    setp.gt.s32 %p20, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd115, %rd82, %rd81, %p20;
-; CHECK-NEXT:    shr.u64 %rd116, %rd2, %r9;
+; CHECK-NEXT:    shr.u64 %rd83, %rd2, %r11;
+; CHECK-NEXT:    setp.gt.s32 %p18, %r9, 63;
+; CHECK-NEXT:    selp.b64 %rd116, %rd83, %rd82, %p18;
+; CHECK-NEXT:    shr.u64 %rd117, %rd2, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd35, %rd3, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd36, %rd4, -1;
-; CHECK-NEXT:    mov.b64 %rd108, 0;
-; CHECK-NEXT:    mov.b64 %rd111, %rd108;
+; CHECK-NEXT:    mov.b64 %rd109, 0;
+; CHECK-NEXT:    mov.b64 %rd112, %rd109;
 ; CHECK-NEXT:  $L__BB4_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd83, %rd115, 63;
-; CHECK-NEXT:    shl.b64 %rd84, %rd116, 1;
-; CHECK-NEXT:    or.b64 %rd85, %rd84, %rd83;
-; CHECK-NEXT:    shl.b64 %rd86, %rd115, 1;
-; CHECK-NEXT:    shr.u64 %rd87, %rd118, 63;
-; CHECK-NEXT:    or.b64 %rd88, %rd86, %rd87;
-; CHECK-NEXT:    shr.u64 %rd89, %rd117, 63;
-; CHECK-NEXT:    shl.b64 %rd90, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
-; CHECK-NEXT:    shl.b64 %rd92, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd117, %rd111, %rd92;
-; CHECK-NEXT:    or.b64 %rd118, %rd108, %rd91;
-; CHECK-NEXT:    sub.cc.s64 %rd93, %rd35, %rd88;
-; CHECK-NEXT:    subc.cc.s64 %rd94, %rd36, %rd85;
-; CHECK-NEXT:    shr.s64 %rd95, %rd94, 63;
-; CHECK-NEXT:    and.b64 %rd111, %rd95, 1;
-; CHECK-NEXT:    and.b64 %rd96, %rd95, %rd3;
-; CHECK-NEXT:    and.b64 %rd97, %rd95, %rd4;
-; CHECK-NEXT:    sub.cc.s64 %rd115, %rd88, %rd96;
-; CHECK-NEXT:    subc.cc.s64 %rd116, %rd85, %rd97;
-; CHECK-NEXT:    add.cc.s64 %rd113, %rd113, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd114, %rd114, -1;
-; CHECK-NEXT:    or.b64 %rd98, %rd113, %rd114;
-; CHECK-NEXT:    setp.eq.b64 %p21, %rd98, 0;
-; CHECK-NEXT:    @%p21 bra $L__BB4_4;
+; CHECK-NEXT:    shr.u64 %rd84, %rd116, 63;
+; CHECK-NEXT:    shl.b64 %rd85, %rd117, 1;
+; CHECK-NEXT:    or.b64 %rd86, %rd85, %rd84;
+; CHECK-NEXT:    shl.b64 %rd87, %rd116, 1;
+; CHECK-NEXT:    shr.u64 %rd88, %rd119, 63;
+; CHECK-NEXT:    or.b64 %rd89, %rd87, %rd88;
+; CHECK-NEXT:    shr.u64 %rd90, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd91, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd92, %rd91, %rd90;
+; CHECK-NEXT:    shl.b64 %rd93, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd118, %rd112, %rd93;
+; CHECK-NEXT:    or.b64 %rd119, %rd109, %rd92;
+; CHECK-NEXT:    sub.cc.s64 %rd94, %rd35, %rd89;
+; CHECK-NEXT:    subc.cc.s64 %rd95, %rd36, %rd86;
+; CHECK-NEXT:    shr.s64 %rd96, %rd95, 63;
+; CHECK-NEXT:    and.b64 %rd112, %rd96, 1;
+; CHECK-NEXT:    and.b64 %rd97, %rd96, %rd3;
+; CHECK-NEXT:    and.b64 %rd98, %rd96, %rd4;
+; CHECK-NEXT:    sub.cc.s64 %rd116, %rd89, %rd97;
+; CHECK-NEXT:    subc.cc.s64 %rd117, %rd86, %rd98;
+; CHECK-NEXT:    add.cc.s64 %rd114, %rd114, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd115, %rd115, -1;
+; CHECK-NEXT:    or.b64 %rd99, %rd114, %rd115;
+; CHECK-NEXT:    setp.eq.b64 %p19, %rd99, 0;
+; CHECK-NEXT:    @%p19 bra $L__BB4_4;
 ; CHECK-NEXT:    bra.uni $L__BB4_2;
 ; CHECK-NEXT:  $L__BB4_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd99, %rd117, 63;
-; CHECK-NEXT:    shl.b64 %rd100, %rd118, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd100, %rd99;
-; CHECK-NEXT:    shl.b64 %rd102, %rd117, 1;
-; CHECK-NEXT:    or.b64 %rd119, %rd111, %rd102;
-; CHECK-NEXT:    or.b64 %rd120, %rd108, %rd101;
+; CHECK-NEXT:    shr.u64 %rd100, %rd118, 63;
+; CHECK-NEXT:    shl.b64 %rd101, %rd119, 1;
+; CHECK-NEXT:    or.b64 %rd102, %rd101, %rd100;
+; CHECK-NEXT:    shl.b64 %rd103, %rd118, 1;
+; CHECK-NEXT:    or.b64 %rd120, %rd112, %rd103;
+; CHECK-NEXT:    or.b64 %rd121, %rd109, %rd102;
 ; CHECK-NEXT:  $L__BB4_5: // %udiv-end
-; CHECK-NEXT:    xor.b64 %rd103, %rd119, %rd5;
 ; CHECK-NEXT:    xor.b64 %rd104, %rd120, %rd5;
-; CHECK-NEXT:    sub.cc.s64 %rd105, %rd103, %rd5;
-; CHECK-NEXT:    subc.cc.s64 %rd106, %rd104, %rd5;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
+; CHECK-NEXT:    xor.b64 %rd105, %rd121, %rd5;
+; CHECK-NEXT:    sub.cc.s64 %rd106, %rd104, %rd5;
+; CHECK-NEXT:    subc.cc.s64 %rd107, %rd105, %rd5;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd106, %rd107};
 ; CHECK-NEXT:    ret;
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
@@ -446,7 +444,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .pred %p<18>;
 ; CHECK-NEXT:    .reg .b32 %r<12>;
-; CHECK-NEXT:    .reg .b64 %rd<105>;
+; CHECK-NEXT:    .reg .b64 %rd<107>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0: // %_udiv-special-cases
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
@@ -470,92 +468,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
 ; CHECK-NEXT:    cvt.u64.u32 %rd52, %r4;
 ; CHECK-NEXT:    add.s64 %rd53, %rd52, 64;
 ; CHECK-NEXT:    selp.b64 %rd54, %rd51, %rd53, %p5;
-; CHECK-NEXT:    mov.b64 %rd95, 0;
-; CHECK-NEXT:    sub.cc.s64 %rd5, %rd50, %rd54;
-; CHECK-NEXT:    subc.cc.s64 %rd6, %rd95, 0;
-; CHECK-NEXT:    setp.gt.u64 %p6, %rd5, 127;
-; CHECK-NEXT:    setp.eq.b64 %p7, %rd6, 0;
+; CHECK-NEXT:    mov.b64 %rd97, 0;
+; CHECK-NEXT:    sub.cc.s64 %rd56, %rd50, %rd54;
+; CHECK-NEXT:    subc.cc.s64 %rd57, %rd97, 0;
+; CHECK-NEXT:    setp.gt.u64 %p6, %rd56, 127;
+; CHECK-NEXT:    setp.eq.b64 %p7, %rd57, 0;
 ; CHECK-NEXT:    and.pred %p8, %p7, %p6;
-; CHECK-NEXT:    setp.ne.b64 %p9, %rd6, 0;
+; CHECK-NEXT:    setp.ne.b64 %p9, %rd57, 0;
 ; CHECK-NEXT:    or.pred %p10, %p8, %p9;
 ; CHECK-NEXT:    or.pred %p11, %p3, %p10;
-; CHECK-NEXT:    xor.b64 %rd56, %rd5, 127;
-; CHECK-NEXT:    or.b64 %rd57, %rd56, %rd6;
-; CHECK-NEXT:    setp.eq.b64 %p12, %rd57, 0;
-; CHECK-NEXT:    selp.b64 %rd104, 0, %rd42, %p11;
-; CHECK-NEXT:    selp.b64 %rd103, 0, %rd41, %p11;
+; CHECK-NEXT:    xor.b64 %rd58, %rd56, 127;
+; CHECK-NEXT:    or.b64 %rd59, %rd58, %rd57;
+; CHECK-NEXT:    setp.eq.b64 %p12, %rd59, 0;
+; CHECK-NEXT:    selp.b64 %rd106, 0, %rd42, %p11;
+; CHECK-NEXT:    selp.b64 %rd105, 0, %rd41, %p11;
 ; CHECK-NEXT:    or.pred %p13, %p11, %p12;
 ; CHECK-NEXT:    @%p13 bra $L__BB5_5;
 ; CHECK-NEXT:  // %bb.3: // %udiv-bb1
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd5, 1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd6, 0;
-; CHECK-NEXT:    or.b64 %rd60, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.b64 %p14, %rd60, 0;
-; CHECK-NEXT:    cvt.u32.u64 %r5, %rd5;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd56, 1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd57, 0;
+; CHECK-NEXT:    or.b64 %rd62, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.b64 %p14, %rd62, 0;
+; CHECK-NEXT:    cvt.u32.u64 %r5, %rd56;
 ; CHECK-NEXT:    sub.s32 %r6, 127, %r5;
-; CHECK-NEXT:    shl.b64 %rd61, %rd42, %r6;
+; CHECK-NEXT:    shl.b64 %rd63, %rd42, %r6;
 ; CHECK-NEXT:    sub.s32 %r7, 64, %r6;
-; CHECK-NEXT:    shr.u64 %rd62, %rd41, %r7;
-; CHECK-NEXT:    or.b64 %rd63, %rd61, %rd62;
+; CHECK-NEXT:    shr.u64 %rd64, %rd41, %r7;
+; CHECK-NEXT:    or.b64 %rd65, %rd63, %rd64;
 ; CHECK-NEXT:    sub.s32 %r8, 63, %r5;
-; CHECK-NEXT:    shl.b64 %rd64, %rd41, %r8;
+; CHECK-NEXT:    shl.b64 %rd66, %rd41, %r8;
 ; CHECK-NEXT:    setp.gt.s32 %p15, %r6, 63;
-; CHECK-NEXT:    selp.b64 %rd102, %rd64, %rd63, %p15;
-; CHECK-NEXT:    shl.b64 %rd101, %rd41, %r6;
-; CHECK-NEXT:    mov.b64 %rd92, %rd95;
+; CHECK-NEXT:    selp.b64 %rd104, %rd66, %rd65, %p15;
+; CHECK-NEXT:    shl.b64 %rd103, %rd41, %r6;
+; CHECK-NEXT:    mov.b64 %rd94, %rd97;
 ; CHECK-NEXT:    @%p14 bra $L__BB5_4;
 ; CHECK-NEXT:  // %bb.1: // %udiv-preheader
-; CHECK-NEXT:    cvt.u32.u64 %r9, %rd97;
-; CHECK-NEXT:    shr.u64 %rd67, %rd41, %r9;
+; CHECK-NEXT:    cvt.u32.u64 %r9, %rd99;
+; CHECK-NEXT:    shr.u64 %rd69, %rd41, %r9;
 ; CHECK-NEXT:    sub.s32 %r10, 64, %r9;
-; CHECK-NEXT:    shl.b64 %rd68, %rd42, %r10;
-; CHECK-NEXT:    or.b64 %rd69, %rd67, %rd68;
+; CHECK-NEXT:    shl.b64 %rd70, %rd42, %r10;
+; CHECK-NEXT:    or.b64 %rd71, %rd69, %rd70;
 ; CHECK-NEXT:    add.s32 %r11, %r9, -64;
-; CHECK-NEXT:    shr.u64 %rd70, %rd42, %r11;
+; CHECK-NEXT:    shr.u64 %rd72, %rd42, %r11;
 ; CHECK-NEXT:    setp.gt.s32 %p16, %r9, 63;
-; CHECK-NEXT:    selp.b64 %rd99, %rd70, %rd69, %p16;
-; CHECK-NEXT:    shr.u64 %rd100, %rd42, %r9;
+; CHECK-NEXT:    selp.b64 %rd101, %rd72, %rd71, %p16;
+; CHECK-NEXT:    shr.u64 %rd102, %rd42, %r9;
 ; CHECK-NEXT:    add.cc.s64 %rd33, %rd43, -1;
 ; CHECK-NEXT:    addc.cc.s64 %rd34, %rd44, -1;
-; CHECK-NEXT:    mov.b64 %rd92, 0;
-; CHECK-NEXT:    mov.b64 %rd95, %rd92;
+; CHECK-NEXT:    mov.b64 %rd94, 0;
+; CHECK-NEXT:    mov.b64 %rd97, %rd94;
 ; CHECK-NEXT:  $L__BB5_2: // %udiv-do-while
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    shr.u64 %rd71, %rd99, 63;
-; CHECK-NEXT:    shl.b64 %rd72, %rd100, 1;
-; CHECK-NEXT:    or.b64 %rd73, %rd72, %rd71;
-; CHECK-NEXT:    shl.b64 %rd74, %rd99, 1;
-; CHECK-NEXT:    shr.u64 %rd75, %rd102, 63;
-; CHECK-NEXT:    or.b64 %rd76, %rd74, %rd75;
-; CHECK-NEXT:    shr.u64 %rd77, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd78, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd79, %rd78, %rd77;
-; CHECK-NEXT:    shl.b64 %rd80, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd101, %rd95, %rd80;
-; CHECK-NEXT:    or.b64 %rd102, %rd92, %rd79;
-; CHECK-NEXT:    sub.cc.s64 %rd81, %rd33, %rd76;
-; CHECK-NEXT:    subc.cc.s64 %rd82, %rd34, %rd73;
-; CHECK-NEXT:    shr.s64 %rd83, %rd82, 63;
-; CHECK-NEXT:    and.b64 %rd95, %rd83, 1;
-; CHECK-NEXT:    and.b64 %rd84, %rd83, %rd43;
-; CHECK-NEXT:    and.b64 %rd85, %rd83, %rd44;
-; CHECK-NEXT:    sub.cc.s64 %rd99, %rd76, %rd84;
-; CHECK-NEXT:    subc.cc.s64 %rd100, %rd73, %rd85;
-; CHECK-NEXT:    add.cc.s64 %rd97, %rd97, -1;
-; CHECK-NEXT:    addc.cc.s64 %rd98, %rd98, -1;
-; CHECK-NEXT:    or.b64 %rd86, %rd97, %rd98;
-; CHECK-NEXT:    setp.eq.b64 %p17, %rd86, 0;
+; CHECK-NEXT:    shr.u64 %rd73, %rd101, 63;
+; CHECK-NEXT:    shl.b64 %rd74, %rd102, 1;
+; CHECK-NEXT:    or.b64 %rd75, %rd74, %rd73;
+; CHECK-NEXT:    shl.b64 %rd76, %rd101, 1;
+; CHECK-NEXT:    shr.u64 %rd77, %rd104, 63;
+; CHECK-NEXT:    or.b64 %rd78, %rd76, %rd77;
+; CHECK-NEXT:    shr.u64 %rd79, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd80, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd81, %rd80, %rd79;
+; CHECK-NEXT:    shl.b64 %rd82, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd103, %rd97, %rd82;
+; CHECK-NEXT:    or.b64 %rd104, %rd94, %rd81;
+; CHECK-NEXT:    sub.cc.s64 %rd83, %rd33, %rd78;
+; CHECK-NEXT:    subc.cc.s64 %rd84, %rd34, %rd75;
+; CHECK-NEXT:    shr.s64 %rd85, %rd84, 63;
+; CHECK-NEXT:    and.b64 %rd97, %rd85, 1;
+; CHECK-NEXT:    and.b64 %rd86, %rd85, %rd43;
+; CHECK-NEXT:    and.b64 %rd87, %rd85, %rd44;
+; CHECK-NEXT:    sub.cc.s64 %rd101, %rd78, %rd86;
+; CHECK-NEXT:    subc.cc.s64 %rd102, %rd75, %rd87;
+; CHECK-NEXT:    add.cc.s64 %rd99, %rd99, -1;
+; CHECK-NEXT:    addc.cc.s64 %rd100, %rd100, -1;
+; CHECK-NEXT:    or.b64 %rd88, %rd99, %rd100;
+; CHECK-NEXT:    setp.eq.b64 %p17, %rd88, 0;
 ; CHECK-NEXT:    @%p17 bra $L__BB5_4;
 ; CHECK-NEXT:    bra.uni $L__BB5_2;
 ; CHECK-NEXT:  $L__BB5_4: // %udiv-loop-exit
-; CHECK-NEXT:    shr.u64 %rd87, %rd101, 63;
-; CHECK-NEXT:    shl.b64 %rd88, %rd102, 1;
-; CHECK-NEXT:    or.b64 %rd89, %rd88, %rd87;
-; CHECK-NEXT:    shl.b64 %rd90, %rd101, 1;
-; CHECK-NEXT:    or.b64 %rd103, %rd95, %rd90;
-; CHECK-NEXT:    or.b64 %rd104, %rd92, %rd89;
+; CHECK-NEXT:    shr.u64 %rd89, %rd103, 63;
+; CHECK-NEXT:    shl.b64 %rd90, %rd104, 1;
+; CHECK-NEXT:    or.b64 %rd91, %rd90, %rd89;
+; CHECK-NEXT:    shl.b64 %rd92, %rd103, 1;
+; CHECK-NEXT:    or.b64 %rd105, %rd97, %rd92;
+; CHECK-NEXT:    or.b64 %rd106, %rd94, %rd91;
 ; CHECK-NEXT:  $L__BB5_5: // %udiv-end
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd103, %rd104};
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd105, %rd106};
 ; CHECK-NEXT:    ret;
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
index 1a61498b1014..2b7a06c33d94 100644
--- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll
@@ -32,31 +32,57 @@ define <2 x i16> @test_ret_const() #0 {
 }
 
 define i16 @test_extract_0(<2 x i16> %a) #0 {
-; COMMON-LABEL: test_extract_0(
-; COMMON:       {
-; COMMON-NEXT:    .reg .b16 %rs<3>;
-; COMMON-NEXT:    .reg .b32 %r<3>;
-; COMMON-EMPTY:
-; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_0_param_0];
-; COMMON-NEXT:    cvt.u32.u16 %r2, %rs1;
-; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
-; COMMON-NEXT:    ret;
+; I16x2-LABEL: test_extract_0(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b16 %rs<2>;
+; I16x2-NEXT:    .reg .b32 %r<3>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
+; I16x2-NEXT:    mov.b32 {%rs1, _}, %r1;
+; I16x2-NEXT:    cvt.u32.u16 %r2, %rs1;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_extract_0(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<2>;
+; NO-I16x2-NEXT:    .reg .b32 %r<3>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_extract_0_param_0];
+; NO-I16x2-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
+; NO-I16x2-NEXT:    cvt.u32.u16 %r2, %rs1;
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; NO-I16x2-NEXT:    ret;
   %e = extractelement <2 x i16> %a, i32 0
   ret i16 %e
 }
 
 define i16 @test_extract_1(<2 x i16> %a) #0 {
-; COMMON-LABEL: test_extract_1(
-; COMMON:       {
-; COMMON-NEXT:    .reg .b16 %rs<3>;
-; COMMON-NEXT:    .reg .b32 %r<3>;
-; COMMON-EMPTY:
-; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_1_param_0];
-; COMMON-NEXT:    cvt.u32.u16 %r2, %rs2;
-; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
-; COMMON-NEXT:    ret;
+; I16x2-LABEL: test_extract_1(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b16 %rs<2>;
+; I16x2-NEXT:    .reg .b32 %r<3>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
+; I16x2-NEXT:    mov.b32 {_, %rs1}, %r1;
+; I16x2-NEXT:    cvt.u32.u16 %r2, %rs1;
+; I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_extract_1(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<2>;
+; NO-I16x2-NEXT:    .reg .b32 %r<3>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_extract_1_param_0];
+; NO-I16x2-NEXT:    { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
+; NO-I16x2-NEXT:    cvt.u32.u16 %r2, %rs1;
+; NO-I16x2-NEXT:    st.param.b32 [func_retval0], %r2;
+; NO-I16x2-NEXT:    ret;
   %e = extractelement <2 x i16> %a, i32 1
   ret i16 %e
 }
@@ -71,8 +97,9 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.b64 %rd1, [test_extract_i_param_1];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_extract_i_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_extract_i_param_0];
 ; COMMON-NEXT:    setp.eq.b64 %p1, %rd1, 0;
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    selp.b16 %rs3, %rs1, %rs2, %p1;
 ; COMMON-NEXT:    cvt.u32.u16 %r2, %rs3;
 ; COMMON-NEXT:    st.param.b32 [func_retval0], %r2;
@@ -99,10 +126,12 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_add_param_0];
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_add_param_1];
-; NO-I16x2-NEXT:    add.s16 %rs5, %rs2, %rs4;
-; NO-I16x2-NEXT:    add.s16 %rs6, %rs1, %rs3;
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_add_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    add.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    add.s16 %rs6, %rs3, %rs1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; NO-I16x2-NEXT:    ret;
   %r = add <2 x i16> %a, %b
@@ -128,7 +157,8 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<2>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_0_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_0_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
 ; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -155,7 +185,8 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<2>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_add_imm_1_param_0];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_add_imm_1_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; NO-I16x2-NEXT:    add.s16 %rs3, %rs2, 2;
 ; NO-I16x2-NEXT:    add.s16 %rs4, %rs1, 1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -171,10 +202,12 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_sub_param_0];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_sub_param_1];
-; COMMON-NEXT:    sub.s16 %rs5, %rs2, %rs4;
-; COMMON-NEXT:    sub.s16 %rs6, %rs1, %rs3;
+; COMMON-NEXT:    ld.param.b32 %r2, [test_sub_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_sub_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    sub.s16 %rs5, %rs4, %rs2;
+; COMMON-NEXT:    sub.s16 %rs6, %rs3, %rs1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; COMMON-NEXT:    ret;
   %r = sub <2 x i16> %a, %b
@@ -199,10 +232,12 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_smax_param_0];
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_smax_param_1];
-; NO-I16x2-NEXT:    max.s16 %rs5, %rs2, %rs4;
-; NO-I16x2-NEXT:    max.s16 %rs6, %rs1, %rs3;
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_smax_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_smax_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    max.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    max.s16 %rs6, %rs3, %rs1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; NO-I16x2-NEXT:    ret;
   %cmp = icmp sgt <2 x i16> %a, %b
@@ -228,10 +263,12 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_umax_param_0];
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_umax_param_1];
-; NO-I16x2-NEXT:    max.u16 %rs5, %rs2, %rs4;
-; NO-I16x2-NEXT:    max.u16 %rs6, %rs1, %rs3;
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_umax_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_umax_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    max.u16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    max.u16 %rs6, %rs3, %rs1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; NO-I16x2-NEXT:    ret;
   %cmp = icmp ugt <2 x i16> %a, %b
@@ -257,10 +294,12 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_smin_param_0];
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_smin_param_1];
-; NO-I16x2-NEXT:    min.s16 %rs5, %rs2, %rs4;
-; NO-I16x2-NEXT:    min.s16 %rs6, %rs1, %rs3;
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_smin_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_smin_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    min.s16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    min.s16 %rs6, %rs3, %rs1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; NO-I16x2-NEXT:    ret;
   %cmp = icmp sle <2 x i16> %a, %b
@@ -286,10 +325,12 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
 ; NO-I16x2-NEXT:    .reg .b32 %r<3>;
 ; NO-I16x2-EMPTY:
 ; NO-I16x2-NEXT:  // %bb.0:
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_umin_param_0];
-; NO-I16x2-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_umin_param_1];
-; NO-I16x2-NEXT:    min.u16 %rs5, %rs2, %rs4;
-; NO-I16x2-NEXT:    min.u16 %rs6, %rs1, %rs3;
+; NO-I16x2-NEXT:    ld.param.b32 %r2, [test_umin_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_umin_param_0];
+; NO-I16x2-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; NO-I16x2-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; NO-I16x2-NEXT:    min.u16 %rs5, %rs4, %rs2;
+; NO-I16x2-NEXT:    min.u16 %rs6, %rs3, %rs1;
 ; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; NO-I16x2-NEXT:    ret;
   %cmp = icmp ule <2 x i16> %a, %b
@@ -304,10 +345,12 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_mul_param_0];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_mul_param_1];
-; COMMON-NEXT:    mul.lo.s16 %rs5, %rs2, %rs4;
-; COMMON-NEXT:    mul.lo.s16 %rs6, %rs1, %rs3;
+; COMMON-NEXT:    ld.param.b32 %r2, [test_mul_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_mul_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    mul.lo.s16 %rs5, %rs4, %rs2;
+; COMMON-NEXT:    mul.lo.s16 %rs6, %rs3, %rs1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; COMMON-NEXT:    ret;
   %r = mul <2 x i16> %a, %b
@@ -686,14 +729,18 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
 ; COMMON-NEXT:    .reg .b32 %r<5>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_param_0];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_param_2];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs5, %rs6}, [test_select_cc_param_3];
-; COMMON-NEXT:    setp.ne.b16 %p1, %rs3, %rs5;
-; COMMON-NEXT:    setp.ne.b16 %p2, %rs4, %rs6;
-; COMMON-NEXT:    ld.param.v2.b16 {%rs7, %rs8}, [test_select_cc_param_1];
-; COMMON-NEXT:    selp.b16 %rs9, %rs2, %rs8, %p2;
-; COMMON-NEXT:    selp.b16 %rs10, %rs1, %rs7, %p1;
+; COMMON-NEXT:    ld.param.b32 %r4, [test_select_cc_param_3];
+; COMMON-NEXT:    ld.param.b32 %r3, [test_select_cc_param_2];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_select_cc_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_select_cc_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
+; COMMON-NEXT:    setp.ne.b16 %p1, %rs3, %rs1;
+; COMMON-NEXT:    setp.ne.b16 %p2, %rs4, %rs2;
+; COMMON-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; COMMON-NEXT:    selp.b16 %rs9, %rs8, %rs6, %p2;
+; COMMON-NEXT:    selp.b16 %rs10, %rs7, %rs5, %p1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs10, %rs9};
 ; COMMON-NEXT:    ret;
   %cc = icmp ne <2 x i16> %c, %d
@@ -711,10 +758,12 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
 ; COMMON-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i32_i16_param_2];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i32_i16_param_3];
-; COMMON-NEXT:    setp.ne.b16 %p1, %rs1, %rs3;
-; COMMON-NEXT:    setp.ne.b16 %p2, %rs2, %rs4;
+; COMMON-NEXT:    ld.param.b32 %r6, [test_select_cc_i32_i16_param_3];
+; COMMON-NEXT:    ld.param.b32 %r5, [test_select_cc_i32_i16_param_2];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r6;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r5;
+; COMMON-NEXT:    setp.ne.b16 %p1, %rs3, %rs1;
+; COMMON-NEXT:    setp.ne.b16 %p2, %rs4, %rs2;
 ; COMMON-NEXT:    selp.b32 %r7, %r2, %r4, %p2;
 ; COMMON-NEXT:    selp.b32 %r8, %r1, %r3, %p1;
 ; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r8, %r7};
@@ -735,12 +784,14 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
 ; COMMON-NEXT:  // %bb.0:
 ; COMMON-NEXT:    ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
 ; COMMON-NEXT:    ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_select_cc_i16_i32_param_0];
+; COMMON-NEXT:    ld.param.b32 %r2, [test_select_cc_i16_i32_param_1];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_select_cc_i16_i32_param_0];
 ; COMMON-NEXT:    setp.ne.b32 %p1, %r3, %r5;
 ; COMMON-NEXT:    setp.ne.b32 %p2, %r4, %r6;
-; COMMON-NEXT:    ld.param.v2.b16 {%rs3, %rs4}, [test_select_cc_i16_i32_param_1];
-; COMMON-NEXT:    selp.b16 %rs5, %rs2, %rs4, %p2;
-; COMMON-NEXT:    selp.b16 %rs6, %rs1, %rs3, %p1;
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; COMMON-NEXT:    mov.b32 {%rs3, %rs4}, %r1;
+; COMMON-NEXT:    selp.b16 %rs5, %rs4, %rs2, %p2;
+; COMMON-NEXT:    selp.b16 %rs6, %rs3, %rs1, %p1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs6, %rs5};
 ; COMMON-NEXT:    ret;
                                           <2 x i32> %c, <2 x i32> %d) #0 {
@@ -851,7 +902,8 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<4>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi32_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_zext_2xi32_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.u32.u16 %r2, %rs2;
 ; COMMON-NEXT:    cvt.u32.u16 %r3, %rs1;
 ; COMMON-NEXT:    st.param.v2.b32 [func_retval0], {%r3, %r2};
@@ -868,7 +920,8 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b64 %rd<3>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_zext_2xi64_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_zext_2xi64_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.u64.u16 %rd1, %rs2;
 ; COMMON-NEXT:    cvt.u64.u16 %rd2, %rs1;
 ; COMMON-NEXT:    st.param.v2.b64 [func_retval0], {%rd2, %rd1};
@@ -926,7 +979,8 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_shufflevector_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_shufflevector_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
 ; COMMON-NEXT:    ret;
   %s = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
@@ -934,16 +988,29 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
 }
 
 define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
-; COMMON-LABEL: test_insertelement(
-; COMMON:       {
-; COMMON-NEXT:    .reg .b16 %rs<4>;
-; COMMON-NEXT:    .reg .b32 %r<2>;
-; COMMON-EMPTY:
-; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
-; COMMON-NEXT:    ld.param.v2.b16 {%rs2, %rs3}, [test_insertelement_param_0];
-; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
-; COMMON-NEXT:    ret;
+; I16x2-LABEL: test_insertelement(
+; I16x2:       {
+; I16x2-NEXT:    .reg .b16 %rs<3>;
+; I16x2-NEXT:    .reg .b32 %r<2>;
+; I16x2-EMPTY:
+; I16x2-NEXT:  // %bb.0:
+; I16x2-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
+; I16x2-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
+; I16x2-NEXT:    mov.b32 {%rs2, _}, %r1;
+; I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
+; I16x2-NEXT:    ret;
+;
+; NO-I16x2-LABEL: test_insertelement(
+; NO-I16x2:       {
+; NO-I16x2-NEXT:    .reg .b16 %rs<3>;
+; NO-I16x2-NEXT:    .reg .b32 %r<2>;
+; NO-I16x2-EMPTY:
+; NO-I16x2-NEXT:  // %bb.0:
+; NO-I16x2-NEXT:    ld.param.b16 %rs1, [test_insertelement_param_1];
+; NO-I16x2-NEXT:    ld.param.b32 %r1, [test_insertelement_param_0];
+; NO-I16x2-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
+; NO-I16x2-NEXT:    st.param.v2.b16 [func_retval0], {%rs2, %rs1};
+; NO-I16x2-NEXT:    ret;
   %i = insertelement <2 x i16> %a, i16 %x, i64 1
   ret <2 x i16> %i
 }
@@ -955,7 +1022,8 @@ define <2 x i16> @test_fptosi_2xhalf_to_2xi16(<2 x half> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptosi_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_fptosi_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
 ; COMMON-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
@@ -971,7 +1039,8 @@ define <2 x i16> @test_fptoui_2xhalf_to_2xi16(<2 x half> %a) #0 {
 ; COMMON-NEXT:    .reg .b32 %r<2>;
 ; COMMON-EMPTY:
 ; COMMON-NEXT:  // %bb.0:
-; COMMON-NEXT:    ld.param.v2.b16 {%rs1, %rs2}, [test_fptoui_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    ld.param.b32 %r1, [test_fptoui_2xhalf_to_2xi16_param_0];
+; COMMON-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
 ; COMMON-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
 ; COMMON-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
 ; COMMON-NEXT:    st.param.v2.b16 [func_retval0], {%rs4, %rs3};
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index 410c0019c722..6a76cd15c321 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -1220,16 +1220,18 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptosi_4xhalf_to_4xi8_param_0];
-; CHECK-NEXT:    cvt.rzi.s16.f16 %rs5, %rs4;
-; CHECK-NEXT:    cvt.rzi.s16.f16 %rs6, %rs3;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT:    cvt.u32.u16 %r4, %rs8;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs7;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.rzi.s16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rzi.s16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
-; CHECK-NEXT:    cvt.rzi.s16.f16 %rs9, %rs2;
-; CHECK-NEXT:    cvt.rzi.s16.f16 %rs10, %rs1;
+; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT:    cvt.rzi.s16.f16 %rs9, %rs8;
+; CHECK-NEXT:    cvt.rzi.s16.f16 %rs10, %rs7;
 ; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
 ; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
@@ -1249,16 +1251,18 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
 ; CHECK-NEXT:    .reg .b32 %r<12>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [test_fptoui_4xhalf_to_4xi8_param_0];
-; CHECK-NEXT:    cvt.rzi.u16.f16 %rs5, %rs4;
-; CHECK-NEXT:    cvt.rzi.u16.f16 %rs6, %rs3;
-; CHECK-NEXT:    mov.b32 %r3, {%rs6, %rs5};
-; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r3;
-; CHECK-NEXT:    cvt.u32.u16 %r4, %rs8;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs7;
+; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r2;
+; CHECK-NEXT:    cvt.rzi.u16.f16 %rs3, %rs2;
+; CHECK-NEXT:    cvt.rzi.u16.f16 %rs4, %rs1;
+; CHECK-NEXT:    mov.b32 %r3, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs5;
 ; CHECK-NEXT:    prmt.b32 %r6, %r5, %r4, 0x3340U;
-; CHECK-NEXT:    cvt.rzi.u16.f16 %rs9, %rs2;
-; CHECK-NEXT:    cvt.rzi.u16.f16 %rs10, %rs1;
+; CHECK-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
+; CHECK-NEXT:    cvt.rzi.u16.f16 %rs9, %rs8;
+; CHECK-NEXT:    cvt.rzi.u16.f16 %rs10, %rs7;
 ; CHECK-NEXT:    mov.b32 %r7, {%rs10, %rs9};
 ; CHECK-NEXT:    mov.b32 {%rs11, %rs12}, %r7;
 ; CHECK-NEXT:    cvt.u32.u16 %r8, %rs12;
diff --git a/llvm/test/CodeGen/NVPTX/pr126337.ll b/llvm/test/CodeGen/NVPTX/pr126337.ll
new file mode 100644
index 000000000000..32e411584b0e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/pr126337.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 | %ptxas-verify %}
+
+; This IR should compile without triggering assertions in LICM
+; when the CopyToReg from %0 in the first BB gets eliminated
+; but we still use its result in the second BB.
+; Technically the problem happens in MIR, but there are multiple
+; passes involved, so testing with the IR reproducer is more convenient.
+; https://github.com/llvm/llvm-project/pull/126337#issuecomment-3081431594
+
+target datalayout = "e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+define ptx_kernel void @Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(<2 x float> %0) {
+; CHECK-LABEL: Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<2>;
+; CHECK-NEXT:    .reg .b16 %rs<2>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0: // %.preheader15
+; CHECK-NEXT:    ld.param.b64 %rd1, [Equal_GPU_DT_COMPLEX64_DT_BOOL_kernel_param_0];
+; CHECK-NEXT:    { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
+; CHECK-NEXT:    setp.eq.f32 %p1, %r1, 0f00000000;
+; CHECK-NEXT:    selp.b16 %rs1, 1, 0, %p1;
+; CHECK-NEXT:  $L__BB0_1: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    mov.b64 %rd2, 0;
+; CHECK-NEXT:    st.b8 [%rd2], %rs1;
+; CHECK-NEXT:    bra.uni $L__BB0_1;
+.preheader15:
+  br label %1
+
+1:                                                ; preds = %1, %.preheader15
+  %2 = fcmp oeq <2 x float> %0, zeroinitializer
+  %3 = extractelement <2 x i1> %2, i64 0
+  store i1 %3, ptr null, align 4
+  br label %1
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
index 87f965c84b6b..92cb51b17f0c 100644
--- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
+++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll
@@ -117,16 +117,20 @@ define float @reduce_fadd_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_param_0];
-; CHECK-NEXT:    add.rn.f32 %r9, %r5, 0f00000000;
-; CHECK-NEXT:    add.rn.f32 %r10, %r9, %r6;
-; CHECK-NEXT:    add.rn.f32 %r11, %r10, %r7;
-; CHECK-NEXT:    add.rn.f32 %r12, %r11, %r8;
-; CHECK-NEXT:    add.rn.f32 %r13, %r12, %r1;
-; CHECK-NEXT:    add.rn.f32 %r14, %r13, %r2;
-; CHECK-NEXT:    add.rn.f32 %r15, %r14, %r3;
-; CHECK-NEXT:    add.rn.f32 %r16, %r15, %r4;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_param_0+16];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd3;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r5, %r6}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT:    add.rn.f32 %r9, %r7, 0f00000000;
+; CHECK-NEXT:    add.rn.f32 %r10, %r9, %r8;
+; CHECK-NEXT:    add.rn.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    add.rn.f32 %r12, %r11, %r6;
+; CHECK-NEXT:    add.rn.f32 %r13, %r12, %r3;
+; CHECK-NEXT:    add.rn.f32 %r14, %r13, %r4;
+; CHECK-NEXT:    add.rn.f32 %r15, %r14, %r1;
+; CHECK-NEXT:    add.rn.f32 %r16, %r15, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r16;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in)
@@ -140,14 +144,18 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) {
 ; CHECK-SM80-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fadd_float_reassoc_param_0];
-; CHECK-SM80-NEXT:    add.rn.f32 %r9, %r7, %r3;
-; CHECK-SM80-NEXT:    add.rn.f32 %r10, %r5, %r1;
-; CHECK-SM80-NEXT:    add.rn.f32 %r11, %r8, %r4;
-; CHECK-SM80-NEXT:    add.rn.f32 %r12, %r6, %r2;
+; CHECK-SM80-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fadd_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fadd_float_reassoc_param_0];
+; CHECK-SM80-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM80-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM80-NEXT:    add.rn.f32 %r5, %r3, %r1;
+; CHECK-SM80-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM80-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM80-NEXT:    add.rn.f32 %r10, %r8, %r6;
+; CHECK-SM80-NEXT:    add.rn.f32 %r11, %r4, %r2;
+; CHECK-SM80-NEXT:    add.rn.f32 %r12, %r9, %r7;
 ; CHECK-SM80-NEXT:    add.rn.f32 %r13, %r12, %r11;
-; CHECK-SM80-NEXT:    add.rn.f32 %r14, %r10, %r9;
+; CHECK-SM80-NEXT:    add.rn.f32 %r14, %r10, %r5;
 ; CHECK-SM80-NEXT:    add.rn.f32 %r15, %r14, %r13;
 ; CHECK-SM80-NEXT:    add.rn.f32 %r16, %r15, 0f00000000;
 ; CHECK-SM80-NEXT:    st.param.b32 [func_retval0], %r16;
@@ -321,15 +329,19 @@ define float @reduce_fmul_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_param_0];
-; CHECK-NEXT:    mul.rn.f32 %r9, %r5, %r6;
-; CHECK-NEXT:    mul.rn.f32 %r10, %r9, %r7;
-; CHECK-NEXT:    mul.rn.f32 %r11, %r10, %r8;
-; CHECK-NEXT:    mul.rn.f32 %r12, %r11, %r1;
-; CHECK-NEXT:    mul.rn.f32 %r13, %r12, %r2;
-; CHECK-NEXT:    mul.rn.f32 %r14, %r13, %r3;
-; CHECK-NEXT:    mul.rn.f32 %r15, %r14, %r4;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_param_0+16];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd3;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r5, %r6}, %rd2;
+; CHECK-NEXT:    mov.b64 {%r7, %r8}, %rd1;
+; CHECK-NEXT:    mul.rn.f32 %r9, %r7, %r8;
+; CHECK-NEXT:    mul.rn.f32 %r10, %r9, %r5;
+; CHECK-NEXT:    mul.rn.f32 %r11, %r10, %r6;
+; CHECK-NEXT:    mul.rn.f32 %r12, %r11, %r3;
+; CHECK-NEXT:    mul.rn.f32 %r13, %r12, %r4;
+; CHECK-NEXT:    mul.rn.f32 %r14, %r13, %r1;
+; CHECK-NEXT:    mul.rn.f32 %r15, %r14, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-NEXT:    ret;
   %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in)
@@ -343,14 +355,18 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) {
 ; CHECK-SM80-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-SM80-EMPTY:
 ; CHECK-SM80-NEXT:  // %bb.0:
-; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_float_reassoc_param_0+16];
-; CHECK-SM80-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmul_float_reassoc_param_0];
-; CHECK-SM80-NEXT:    mul.rn.f32 %r9, %r7, %r3;
-; CHECK-SM80-NEXT:    mul.rn.f32 %r10, %r5, %r1;
-; CHECK-SM80-NEXT:    mul.rn.f32 %r11, %r8, %r4;
-; CHECK-SM80-NEXT:    mul.rn.f32 %r12, %r6, %r2;
+; CHECK-SM80-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmul_float_reassoc_param_0+16];
+; CHECK-SM80-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmul_float_reassoc_param_0];
+; CHECK-SM80-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-SM80-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-SM80-NEXT:    mul.rn.f32 %r5, %r3, %r1;
+; CHECK-SM80-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-SM80-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-SM80-NEXT:    mul.rn.f32 %r10, %r8, %r6;
+; CHECK-SM80-NEXT:    mul.rn.f32 %r11, %r4, %r2;
+; CHECK-SM80-NEXT:    mul.rn.f32 %r12, %r9, %r7;
 ; CHECK-SM80-NEXT:    mul.rn.f32 %r13, %r12, %r11;
-; CHECK-SM80-NEXT:    mul.rn.f32 %r14, %r10, %r9;
+; CHECK-SM80-NEXT:    mul.rn.f32 %r14, %r10, %r5;
 ; CHECK-SM80-NEXT:    mul.rn.f32 %r15, %r14, %r13;
 ; CHECK-SM80-NEXT:    st.param.b32 [func_retval0], %r15;
 ; CHECK-SM80-NEXT:    ret;
@@ -494,13 +510,17 @@ define float @reduce_fmax_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_param_0];
-; CHECK-NEXT:    max.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    max.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    max.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    max.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    max.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    max.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    max.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    max.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    max.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    max.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    max.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    max.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -517,13 +537,17 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmax_float_reassoc_param_0];
-; CHECK-NEXT:    max.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    max.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    max.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    max.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    max.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmax_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmax_float_reassoc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    max.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    max.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    max.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    max.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    max.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    max.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    max.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -628,13 +652,17 @@ define float @reduce_fmin_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_param_0];
-; CHECK-NEXT:    min.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    min.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    min.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    min.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    min.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    min.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    min.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    min.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    min.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    min.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    min.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    min.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -651,13 +679,17 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmin_float_reassoc_param_0];
-; CHECK-NEXT:    min.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    min.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    min.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    min.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    min.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmin_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmin_float_reassoc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    min.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    min.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    min.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    min.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    min.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    min.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    min.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -762,13 +794,17 @@ define float @reduce_fmaximum_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_param_0];
-; CHECK-NEXT:    max.NaN.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    max.NaN.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    max.NaN.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    max.NaN.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    max.NaN.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    max.NaN.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    max.NaN.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    max.NaN.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    max.NaN.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    max.NaN.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -785,13 +821,17 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fmaximum_float_reassoc_param_0];
-; CHECK-NEXT:    max.NaN.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    max.NaN.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    max.NaN.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    max.NaN.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fmaximum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fmaximum_float_reassoc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    max.NaN.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    max.NaN.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    max.NaN.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    max.NaN.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    max.NaN.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    max.NaN.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    max.NaN.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -896,13 +936,17 @@ define float @reduce_fminimum_float(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_param_0];
-; CHECK-NEXT:    min.NaN.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    min.NaN.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    min.NaN.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    min.NaN.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    min.NaN.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    min.NaN.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    min.NaN.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    min.NaN.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    min.NaN.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    min.NaN.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
@@ -919,13 +963,17 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) {
 ; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_float_reassoc_param_0+16];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_fminimum_float_reassoc_param_0];
-; CHECK-NEXT:    min.NaN.f32 %r9, %r8, %r4;
-; CHECK-NEXT:    min.NaN.f32 %r10, %r6, %r2;
-; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r9;
-; CHECK-NEXT:    min.NaN.f32 %r12, %r7, %r3;
-; CHECK-NEXT:    min.NaN.f32 %r13, %r5, %r1;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [reduce_fminimum_float_reassoc_param_0+16];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [reduce_fminimum_float_reassoc_param_0];
+; CHECK-NEXT:    mov.b64 {%r1, %r2}, %rd4;
+; CHECK-NEXT:    mov.b64 {%r3, %r4}, %rd2;
+; CHECK-NEXT:    min.NaN.f32 %r5, %r4, %r2;
+; CHECK-NEXT:    mov.b64 {%r6, %r7}, %rd3;
+; CHECK-NEXT:    mov.b64 {%r8, %r9}, %rd1;
+; CHECK-NEXT:    min.NaN.f32 %r10, %r9, %r7;
+; CHECK-NEXT:    min.NaN.f32 %r11, %r10, %r5;
+; CHECK-NEXT:    min.NaN.f32 %r12, %r3, %r1;
+; CHECK-NEXT:    min.NaN.f32 %r13, %r8, %r6;
 ; CHECK-NEXT:    min.NaN.f32 %r14, %r13, %r12;
 ; CHECK-NEXT:    min.NaN.f32 %r15, %r14, %r11;
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r15;
diff --git a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
index 41e21248a3f0..2796cdb3ae87 100644
--- a/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
+++ b/llvm/test/CodeGen/PowerPC/NoCRFieldRedefWhenSpillingCRBIT.mir
@@ -1,6 +1,12 @@
 # RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
 # RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
 # RUN:   -o - | FileCheck %s
+# RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
+# RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck %s
+# RUN: llc -mcpu=pwr10 -mtriple=powerpc64le-unknown-linux-gnu -start-after \
+# RUN:   virtregrewriter -ppc-asm-full-reg-names -verify-machineinstrs %s \
+# RUN:   -o - | FileCheck %s
 
 --- |
   ; ModuleID = 'a.ll'
@@ -30,7 +36,7 @@
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(ptr, ptr) #1
   
-  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
   attributes #1 = { nounwind }
   
   !llvm.ident = !{!0}
diff --git a/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll b/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll
new file mode 100644
index 000000000000..c119da6e050a
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-lower-arbitrary-sized-ints.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc --verify-machineinstrs -mtriple powerpc-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s --check-prefixes=CHECK,CHECK32
+; RUN: llc --verify-machineinstrs -mtriple powerpc64-ibm-aix-xcoff \
+; RUN:   -ppc-asm-full-reg-names -mcpu=pwr8 < %s | \
+; RUN: FileCheck %s --check-prefixes=CHECK,CHECK64
+
+define ptr @lower_args(ptr %_0, i32 %0, i32 %1, i32 %2, i32 %3, ptr %4, ptr %5, i64 %6, i24 %7) {
+; CHECK-LABEL: lower_args:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  ret ptr %_0
+}
+
+define i32 @lower_args_withops_zeroext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i24 %i) {
+; CHECK32-LABEL: lower_args_withops_zeroext:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    lwz r3, 56(r1)
+; CHECK32-NEXT:    addi r3, r3, 255
+; CHECK32-NEXT:    clrlwi r3, r3, 8
+; CHECK32-NEXT:    blr
+;
+; CHECK64-LABEL: lower_args_withops_zeroext:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    lwz r3, 116(r1)
+; CHECK64-NEXT:    addi r3, r3, 255
+; CHECK64-NEXT:    clrldi r3, r3, 40
+; CHECK64-NEXT:    blr
+entry:
+  %0 = add i24 %i, 255
+  %1 = zext i24 %0 to i32
+  ret i32 %1
+}
+
+define i32 @lower_args_withops_signext(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f, i32 %g, i32 %h, i24 signext %i) {
+; CHECK32-LABEL: lower_args_withops_signext:
+; CHECK32:       # %bb.0: # %entry
+; CHECK32-NEXT:    lwz r3, 56(r1)
+; CHECK32-NEXT:    slwi r3, r3, 8
+; CHECK32-NEXT:    srawi r3, r3, 8
+; CHECK32-NEXT:    slwi r3, r3, 8
+; CHECK32-NEXT:    addi r3, r3, 22272
+; CHECK32-NEXT:    srawi r3, r3, 8
+; CHECK32-NEXT:    blr
+;
+; CHECK64-LABEL: lower_args_withops_signext:
+; CHECK64:       # %bb.0: # %entry
+; CHECK64-NEXT:    lwz r3, 116(r1)
+; CHECK64-NEXT:    slwi r3, r3, 8
+; CHECK64-NEXT:    srawi r3, r3, 8
+; CHECK64-NEXT:    addi r3, r3, 87
+; CHECK64-NEXT:    sldi r3, r3, 40
+; CHECK64-NEXT:    sradi r3, r3, 40
+; CHECK64-NEXT:    blr
+entry:
+  %0 = add i24 %i, 87
+  %1 = sext i24 %0 to i32
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
new file mode 100644
index 000000000000..238e200bfc78
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/froundeven-legalization.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le < %s | FileCheck %s
+
+define void @test(ptr %p1, ptr %p2) nounwind {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    stdu 1, -224(1)
+; CHECK-NEXT:    li 5, 48
+; CHECK-NEXT:    std 0, 240(1)
+; CHECK-NEXT:    std 27, 184(1) # 8-byte Folded Spill
+; CHECK-NEXT:    li 27, 16
+; CHECK-NEXT:    std 28, 192(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 29, 200(1) # 8-byte Folded Spill
+; CHECK-NEXT:    li 29, 32
+; CHECK-NEXT:    li 28, 48
+; CHECK-NEXT:    stxvd2x 56, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 64
+; CHECK-NEXT:    std 30, 208(1) # 8-byte Folded Spill
+; CHECK-NEXT:    mr 30, 4
+; CHECK-NEXT:    stxvd2x 57, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 80
+; CHECK-NEXT:    stxvd2x 58, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 96
+; CHECK-NEXT:    lxvd2x 58, 0, 3
+; CHECK-NEXT:    stxvd2x 59, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 112
+; CHECK-NEXT:    lxvd2x 59, 3, 27
+; CHECK-NEXT:    stxvd2x 60, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 128
+; CHECK-NEXT:    stxvd2x 61, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 144
+; CHECK-NEXT:    stxvd2x 62, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    li 5, 160
+; CHECK-NEXT:    lxvd2x 62, 3, 28
+; CHECK-NEXT:    stxvd2x 63, 1, 5 # 16-byte Folded Spill
+; CHECK-NEXT:    lxvd2x 63, 3, 29
+; CHECK-NEXT:    xxswapd 57, 58
+; CHECK-NEXT:    xxswapd 1, 59
+; CHECK-NEXT:    xxswapd 60, 62
+; CHECK-NEXT:    xxswapd 61, 63
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 56, 1
+; CHECK-NEXT:    xxlor 1, 59, 59
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 60, 60
+; CHECK-NEXT:    xxmrgld 59, 0, 56
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 60, 1
+; CHECK-NEXT:    xxlor 1, 62, 62
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 61, 61
+; CHECK-NEXT:    xxmrgld 62, 0, 60
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 61, 1
+; CHECK-NEXT:    xxlor 1, 63, 63
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    xxlor 1, 57, 57
+; CHECK-NEXT:    xxmrgld 63, 0, 61
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    xxswapd 61, 1
+; CHECK-NEXT:    xxlor 1, 58, 58
+; CHECK-NEXT:    bl roundeven
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    li 3, 160
+; CHECK-NEXT:    stxvd2x 63, 30, 29
+; CHECK-NEXT:    xxswapd 0, 1
+; CHECK-NEXT:    stxvd2x 62, 30, 28
+; CHECK-NEXT:    stxvd2x 59, 30, 27
+; CHECK-NEXT:    ld 29, 200(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 28, 192(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 27, 184(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 144
+; CHECK-NEXT:    xxmrgld 0, 0, 61
+; CHECK-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 128
+; CHECK-NEXT:    stxvd2x 0, 0, 30
+; CHECK-NEXT:    ld 30, 208(1) # 8-byte Folded Reload
+; CHECK-NEXT:    lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 112
+; CHECK-NEXT:    lxvd2x 60, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 96
+; CHECK-NEXT:    lxvd2x 59, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 80
+; CHECK-NEXT:    lxvd2x 58, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 64
+; CHECK-NEXT:    lxvd2x 57, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    li 3, 48
+; CHECK-NEXT:    lxvd2x 56, 1, 3 # 16-byte Folded Reload
+; CHECK-NEXT:    addi 1, 1, 224
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+  %v = load <8 x double>, ptr %p1, align 64
+  %res = call <8 x double> @llvm.roundeven.v8f64(<8 x double> %v)
+  store <8 x double> %res, ptr %p2, align 64
+  ret void
+}
+
+declare <8 x double> @llvm.roundeven.v8f64(<8 x double>)
diff --git a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
index 232014db9a01..a9503f77c309 100644
--- a/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
+++ b/llvm/test/CodeGen/PowerPC/mtvsrbmi.ll
@@ -2,22 +2,87 @@
 ; Verify whether the generated assembly for the following function includes the mtvsrbmi instruction.
 ; vector unsigned char v00FF()
 ; {
-; vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
-; return x;
+;   vector unsigned char x = { 0xFF, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 };
+;   return x;
+; }
+; vector unsigned short short00FF()
+; {
+;   vector unsigned short x = { 0xFF, 0,0,0, 0,0,0,0};
+;   return x;
+; }
+; vector unsigned int int00FF()
+; {
+;   vector unsigned int x = { 0xFF, 0,0,0};
+;   return x;
+; }
+; vector unsigned long long  longlong00FF()
+; {
+;   vector unsigned long long x = { 0xFF, 0};
+;   return x;
 ; }
 
 ; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc-ibm-aix -mcpu=pwr10  -verify-machineinstrs \
-; RUN:   | FileCheck %s --check-prefix=CHECK
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-BE
+
+; RUN: llc < %s -ppc-asm-full-reg-names  -mtriple=powerpc64le-unknown-gnu-linux -mcpu=pwr10  -verify-machineinstrs \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+
+; CHECK-NOT:   .byte   255
+; CHECK-NOT:   .byte   0
 
 define dso_local noundef range(i8 -1, 1) <16 x i8> @_Z5v00FFv() {
-; CHECK-NOT:      L..CPI0_0:
-; CHECK-NOT:   .byte   255                             # 0xff
-; CHECK-NOT:   .byte   0                               # 0x0
-
-; CHECK-LABEL: _Z5v00FFv:
-; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    mtvsrbmi v2, 1
-; CHECK-NEXT:    blr
+; CHECK-BE-LABEL: _Z5v00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 32768
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z5v00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+
 entry:
   ret <16 x i8> <i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
 }
+
+define dso_local noundef range(i16 0, 256) <8 x i16> @_Z9short00FFv() {
+; CHECK-BE-LABEL: _Z9short00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 16384
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z9short00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <8 x i16> <i16 255, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+}
+
+define dso_local noundef range(i32 0, 256) <4 x i32> @_Z7int00FFv() {
+; CHECK-BE-LABEL: _Z7int00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 4096
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z7int00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <4 x i32> <i32 255, i32 0, i32 0, i32 0>
+}
+
+define dso_local noundef range(i64 0, 256) <2 x i64> @_Z12longlong00FFv() {
+; CHECK-BE-LABEL: _Z12longlong00FFv:
+; CHECK-BE:       # %bb.0: # %entry
+; CHECK-BE-NEXT:    mtvsrbmi v2, 256
+; CHECK-BE-NEXT:    blr
+;
+; CHECK-LE-LABEL: _Z12longlong00FFv:
+; CHECK-LE:       # %bb.0: # %entry
+; CHECK-LE-NEXT:    mtvsrbmi v2, 1
+; CHECK-LE-NEXT:    blr
+entry:
+	  ret <2 x i64> <i64 255, i64 0>
+}
diff --git a/llvm/test/CodeGen/PowerPC/nofpclass.ll b/llvm/test/CodeGen/PowerPC/nofpclass.ll
new file mode 100644
index 000000000000..b08e810cd1cc
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/nofpclass.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-ibm-aix-xcoff < %s | FileCheck %s
+
+; TODO: Update this test after adding the proper expansion of nofpclass for
+; ppc_fp128 to test with more masks and to demonstrate preserving nofpclass
+; after legalization.
+
+define ppc_fp128 @f(ppc_fp128 nofpclass(nan) %s) {
+; CHECK-LABEL: f:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    blr
+entry:
+  ret ppc_fp128 %s
+}
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
index 821cfd00dcd0..b540948b20f7 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -764,8 +764,13 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ;
 ; CHECK-PWR7-LABEL: sub_absv_8_ext:
 ; CHECK-PWR7:       # %bb.0: # %entry
-; CHECK-PWR7-NEXT:    stdu r1, -448(r1)
-; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 448
+; CHECK-PWR7-NEXT:    stdu r1, -512(r1)
+; CHECK-PWR7-NEXT:    .cfi_def_cfa_offset 512
+; CHECK-PWR7-NEXT:    .cfi_offset r14, -144
+; CHECK-PWR7-NEXT:    .cfi_offset r15, -136
+; CHECK-PWR7-NEXT:    .cfi_offset r16, -128
+; CHECK-PWR7-NEXT:    .cfi_offset r17, -120
+; CHECK-PWR7-NEXT:    .cfi_offset r18, -112
 ; CHECK-PWR7-NEXT:    .cfi_offset r19, -104
 ; CHECK-PWR7-NEXT:    .cfi_offset r20, -96
 ; CHECK-PWR7-NEXT:    .cfi_offset r21, -88
@@ -778,258 +783,244 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr
 ; CHECK-PWR7-NEXT:    .cfi_offset r28, -32
 ; CHECK-PWR7-NEXT:    .cfi_offset r29, -24
 ; CHECK-PWR7-NEXT:    .cfi_offset r30, -16
-; CHECK-PWR7-NEXT:    addi r3, r1, 304
-; CHECK-PWR7-NEXT:    std r19, 344(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r20, 352(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r21, 360(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r22, 368(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r23, 376(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r24, 384(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r25, 392(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r26, 400(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r27, 408(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r28, 416(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r29, 424(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    std r30, 432(r1) # 8-byte Folded Spill
-; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    .cfi_offset r31, -8
+; CHECK-PWR7-NEXT:    .cfi_offset r2, -152
 ; CHECK-PWR7-NEXT:    addi r3, r1, 320
-; CHECK-PWR7-NEXT:    lbz r7, 304(r1)
-; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    lbz r8, 320(r1)
-; CHECK-PWR7-NEXT:    lbz r9, 305(r1)
-; CHECK-PWR7-NEXT:    lbz r10, 321(r1)
-; CHECK-PWR7-NEXT:    lbz r26, 325(r1)
-; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT:    clrlwi r8, r8, 24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT:    lbz r11, 306(r1)
-; CHECK-PWR7-NEXT:    lbz r12, 322(r1)
-; CHECK-PWR7-NEXT:    lbz r23, 314(r1)
-; CHECK-PWR7-NEXT:    clrlwi r22, r26, 24
-; CHECK-PWR7-NEXT:    lbz r26, 330(r1)
-; CHECK-PWR7-NEXT:    sub r8, r7, r8
-; CHECK-PWR7-NEXT:    lbz r7, 315(r1)
-; CHECK-PWR7-NEXT:    sub r20, r9, r10
-; CHECK-PWR7-NEXT:    lbz r9, 331(r1)
-; CHECK-PWR7-NEXT:    lbz r0, 307(r1)
-; CHECK-PWR7-NEXT:    lbz r30, 323(r1)
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r12, r12, 24
-; CHECK-PWR7-NEXT:    clrlwi r23, r23, 24
-; CHECK-PWR7-NEXT:    clrlwi r21, r26, 24
-; CHECK-PWR7-NEXT:    clrlwi r7, r7, 24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r0, r0, 24
-; CHECK-PWR7-NEXT:    clrlwi r30, r30, 24
-; CHECK-PWR7-NEXT:    lbz r29, 308(r1)
-; CHECK-PWR7-NEXT:    lbz r28, 324(r1)
-; CHECK-PWR7-NEXT:    lbz r27, 309(r1)
-; CHECK-PWR7-NEXT:    lbz r25, 310(r1)
-; CHECK-PWR7-NEXT:    lbz r24, 326(r1)
-; CHECK-PWR7-NEXT:    sub r19, r11, r12
-; CHECK-PWR7-NEXT:    sub r11, r23, r21
-; CHECK-PWR7-NEXT:    sub r9, r7, r9
-; CHECK-PWR7-NEXT:    sub r26, r0, r30
-; CHECK-PWR7-NEXT:    srawi r12, r11, 31
-; CHECK-PWR7-NEXT:    srawi r0, r9, 31
-; CHECK-PWR7-NEXT:    lbz r3, 312(r1)
-; CHECK-PWR7-NEXT:    clrlwi r29, r29, 24
-; CHECK-PWR7-NEXT:    clrlwi r28, r28, 24
-; CHECK-PWR7-NEXT:    clrlwi r27, r27, 24
-; CHECK-PWR7-NEXT:    clrlwi r25, r25, 24
-; CHECK-PWR7-NEXT:    clrlwi r24, r24, 24
-; CHECK-PWR7-NEXT:    xor r11, r11, r12
-; CHECK-PWR7-NEXT:    xor r9, r9, r0
-; CHECK-PWR7-NEXT:    sub r28, r29, r28
-; CHECK-PWR7-NEXT:    sub r30, r27, r22
-; CHECK-PWR7-NEXT:    sub r29, r25, r24
-; CHECK-PWR7-NEXT:    sub r27, r11, r12
-; CHECK-PWR7-NEXT:    sub r24, r9, r0
-; CHECK-PWR7-NEXT:    lbz r9, 316(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 332(r1)
-; CHECK-PWR7-NEXT:    lbz r4, 328(r1)
-; CHECK-PWR7-NEXT:    lbz r5, 311(r1)
-; CHECK-PWR7-NEXT:    lbz r6, 327(r1)
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r3, r3, 24
-; CHECK-PWR7-NEXT:    clrlwi r4, r4, 24
-; CHECK-PWR7-NEXT:    clrlwi r5, r5, 24
-; CHECK-PWR7-NEXT:    clrlwi r6, r6, 24
-; CHECK-PWR7-NEXT:    sub r3, r3, r4
+; CHECK-PWR7-NEXT:    std r14, 368(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r15, 376(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r16, 384(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r17, 392(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r18, 400(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r19, 408(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r20, 416(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r21, 424(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r22, 432(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r23, 440(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r24, 448(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r25, 456(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r26, 464(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r27, 472(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r28, 480(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r29, 488(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r30, 496(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r31, 504(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    std r2, 360(r1) # 8-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v2, 0, r3
+; CHECK-PWR7-NEXT:    lbz r3, 320(r1)
+; CHECK-PWR7-NEXT:    addi r4, r1, 336
+; CHECK-PWR7-NEXT:    stw r3, 60(r1) # 4-byte Folded Spill
+; CHECK-PWR7-NEXT:    stxvw4x v3, 0, r4
+; CHECK-PWR7-NEXT:    lbz r15, 334(r1)
+; CHECK-PWR7-NEXT:    lbz r14, 350(r1)
+; CHECK-PWR7-NEXT:    lbz r31, 335(r1)
+; CHECK-PWR7-NEXT:    lbz r2, 351(r1)
+; CHECK-PWR7-NEXT:    sub r15, r15, r14
+; CHECK-PWR7-NEXT:    sub r14, r31, r2
+; CHECK-PWR7-NEXT:    srawi r2, r14, 31
+; CHECK-PWR7-NEXT:    xor r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r3, 333(r1)
+; CHECK-PWR7-NEXT:    lbz r19, 331(r1)
+; CHECK-PWR7-NEXT:    lbz r18, 347(r1)
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    lbz r17, 332(r1)
+; CHECK-PWR7-NEXT:    lbz r16, 348(r1)
+; CHECK-PWR7-NEXT:    sub r17, r17, r16
+; CHECK-PWR7-NEXT:    lbz r23, 329(r1)
+; CHECK-PWR7-NEXT:    sub r14, r14, r2
+; CHECK-PWR7-NEXT:    lbz r2, 349(r1)
+; CHECK-PWR7-NEXT:    lbz r22, 345(r1)
+; CHECK-PWR7-NEXT:    lbz r4, 336(r1)
+; CHECK-PWR7-NEXT:    lbz r5, 321(r1)
+; CHECK-PWR7-NEXT:    lbz r6, 337(r1)
+; CHECK-PWR7-NEXT:    lbz r7, 322(r1)
+; CHECK-PWR7-NEXT:    lbz r8, 338(r1)
+; CHECK-PWR7-NEXT:    lbz r9, 323(r1)
+; CHECK-PWR7-NEXT:    lbz r10, 339(r1)
+; CHECK-PWR7-NEXT:    lbz r11, 324(r1)
+; CHECK-PWR7-NEXT:    lbz r12, 340(r1)
+; CHECK-PWR7-NEXT:    lbz r0, 325(r1)
+; CHECK-PWR7-NEXT:    lbz r30, 341(r1)
+; CHECK-PWR7-NEXT:    lbz r29, 326(r1)
+; CHECK-PWR7-NEXT:    lbz r28, 342(r1)
+; CHECK-PWR7-NEXT:    lbz r27, 327(r1)
+; CHECK-PWR7-NEXT:    lbz r26, 343(r1)
+; CHECK-PWR7-NEXT:    sub r3, r3, r2
+; CHECK-PWR7-NEXT:    lbz r25, 328(r1)
+; CHECK-PWR7-NEXT:    lbz r24, 344(r1)
+; CHECK-PWR7-NEXT:    lbz r21, 330(r1)
+; CHECK-PWR7-NEXT:    lbz r20, 346(r1)
 ; CHECK-PWR7-NEXT:    sub r5, r5, r6
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    srawi r4, r3, 31
+; CHECK-PWR7-NEXT:    srawi r18, r3, 31
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
+; CHECK-PWR7-NEXT:    sub r0, r0, r30
+; CHECK-PWR7-NEXT:    sub r29, r29, r28
+; CHECK-PWR7-NEXT:    sub r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r25, r25, r24
+; CHECK-PWR7-NEXT:    srawi r31, r15, 31
+; CHECK-PWR7-NEXT:    ld r2, 360(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    xor r3, r3, r18
 ; CHECK-PWR7-NEXT:    srawi r6, r5, 31
-; CHECK-PWR7-NEXT:    xor r3, r3, r4
-; CHECK-PWR7-NEXT:    sldi r27, r27, 56
-; CHECK-PWR7-NEXT:    xor r5, r5, r6
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    sub r3, r3, r4
-; CHECK-PWR7-NEXT:    sldi r24, r24, 56
+; CHECK-PWR7-NEXT:    srawi r8, r7, 31
+; CHECK-PWR7-NEXT:    srawi r10, r9, 31
+; CHECK-PWR7-NEXT:    srawi r12, r11, 31
+; CHECK-PWR7-NEXT:    srawi r30, r0, 31
+; CHECK-PWR7-NEXT:    sub r3, r3, r18
+; CHECK-PWR7-NEXT:    srawi r18, r19, 31
+; CHECK-PWR7-NEXT:    srawi r28, r29, 31
+; CHECK-PWR7-NEXT:    ld r16, 384(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r27, 208(r1)
-; CHECK-PWR7-NEXT:    sub r4, r5, r6
-; CHECK-PWR7-NEXT:    std r27, 216(r1)
-; CHECK-PWR7-NEXT:    srawi r27, r29, 31
-; CHECK-PWR7-NEXT:    lbz r10, 313(r1)
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    std r24, 224(r1)
-; CHECK-PWR7-NEXT:    lbz r22, 329(r1)
-; CHECK-PWR7-NEXT:    std r24, 232(r1)
-; CHECK-PWR7-NEXT:    srawi r24, r30, 31
-; CHECK-PWR7-NEXT:    ld r21, 360(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r23, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 317(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 333(r1)
-; CHECK-PWR7-NEXT:    xor r29, r29, r27
-; CHECK-PWR7-NEXT:    std r3, 176(r1)
-; CHECK-PWR7-NEXT:    std r3, 184(r1)
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    sldi r23, r23, 56
-; CHECK-PWR7-NEXT:    xor r30, r30, r24
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    sub r4, r30, r24
-; CHECK-PWR7-NEXT:    ld r30, 432(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r3, 160(r1)
-; CHECK-PWR7-NEXT:    std r3, 168(r1)
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    sub r3, r29, r27
-; CHECK-PWR7-NEXT:    std r23, 240(r1)
-; CHECK-PWR7-NEXT:    ld r29, 424(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r23, 248(r1)
-; CHECK-PWR7-NEXT:    ld r27, 408(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r23, r28, 31
+; CHECK-PWR7-NEXT:    srawi r26, r27, 31
+; CHECK-PWR7-NEXT:    srawi r24, r25, 31
+; CHECK-PWR7-NEXT:    xor r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r5, r5, r6
+; CHECK-PWR7-NEXT:    std r3, 272(r1)
+; CHECK-PWR7-NEXT:    std r3, 280(r1)
+; CHECK-PWR7-NEXT:    srawi r3, r17, 31
+; CHECK-PWR7-NEXT:    sub r19, r19, r18
+; CHECK-PWR7-NEXT:    xor r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r15, r15, r31
+; CHECK-PWR7-NEXT:    xor r17, r17, r3
+; CHECK-PWR7-NEXT:    xor r9, r9, r10
+; CHECK-PWR7-NEXT:    xor r11, r11, r12
+; CHECK-PWR7-NEXT:    xor r0, r0, r30
+; CHECK-PWR7-NEXT:    xor r29, r29, r28
+; CHECK-PWR7-NEXT:    xor r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r3, r17, r3
+; CHECK-PWR7-NEXT:    xor r25, r25, r24
+; CHECK-PWR7-NEXT:    sub r25, r25, r24
+; CHECK-PWR7-NEXT:    sub r27, r27, r26
+; CHECK-PWR7-NEXT:    sub r29, r29, r28
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    xor r28, r28, r23
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    std r3, 144(r1)
-; CHECK-PWR7-NEXT:    ld r24, 384(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    std r3, 152(r1)
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    sub r25, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 318(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 334(r1)
-; CHECK-PWR7-NEXT:    std r3, 128(r1)
+; CHECK-PWR7-NEXT:    sub r0, r0, r30
+; CHECK-PWR7-NEXT:    sub r11, r11, r12
+; CHECK-PWR7-NEXT:    sub r9, r9, r10
+; CHECK-PWR7-NEXT:    sub r7, r7, r8
+; CHECK-PWR7-NEXT:    sub r5, r5, r6
+; CHECK-PWR7-NEXT:    sldi r14, r14, 56
+; CHECK-PWR7-NEXT:    sldi r15, r15, 56
+; CHECK-PWR7-NEXT:    ld r31, 504(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r3, 256(r1)
+; CHECK-PWR7-NEXT:    std r3, 264(r1)
+; CHECK-PWR7-NEXT:    sldi r3, r19, 56
 ; CHECK-PWR7-NEXT:    sldi r25, r25, 56
-; CHECK-PWR7-NEXT:    std r3, 136(r1)
-; CHECK-PWR7-NEXT:    sub r3, r28, r23
+; CHECK-PWR7-NEXT:    sldi r27, r27, 56
+; CHECK-PWR7-NEXT:    std r3, 240(r1)
+; CHECK-PWR7-NEXT:    std r3, 248(r1)
+; CHECK-PWR7-NEXT:    sub r3, r23, r22
+; CHECK-PWR7-NEXT:    srawi r23, r3, 31
+; CHECK-PWR7-NEXT:    sub r22, r21, r20
+; CHECK-PWR7-NEXT:    srawi r21, r22, 31
+; CHECK-PWR7-NEXT:    sldi r29, r29, 56
+; CHECK-PWR7-NEXT:    sldi r0, r0, 56
+; CHECK-PWR7-NEXT:    sldi r11, r11, 56
+; CHECK-PWR7-NEXT:    xor r3, r3, r23
+; CHECK-PWR7-NEXT:    xor r22, r22, r21
+; CHECK-PWR7-NEXT:    sldi r9, r9, 56
+; CHECK-PWR7-NEXT:    sldi r7, r7, 56
+; CHECK-PWR7-NEXT:    sldi r5, r5, 56
+; CHECK-PWR7-NEXT:    ld r30, 496(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r28, 480(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    sub r3, r3, r23
+; CHECK-PWR7-NEXT:    sub r22, r22, r21
+; CHECK-PWR7-NEXT:    std r14, 304(r1)
+; CHECK-PWR7-NEXT:    ld r26, 464(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    std r3, 112(r1)
-; CHECK-PWR7-NEXT:    ld r28, 416(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    clrlwi r10, r10, 24
-; CHECK-PWR7-NEXT:    std r25, 256(r1)
-; CHECK-PWR7-NEXT:    std r25, 264(r1)
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    srawi r25, r26, 31
-; CHECK-PWR7-NEXT:    xor r26, r26, r25
-; CHECK-PWR7-NEXT:    ld r23, 376(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r3, 120(r1)
-; CHECK-PWR7-NEXT:    sub r4, r26, r25
-; CHECK-PWR7-NEXT:    clrlwi r22, r22, 24
-; CHECK-PWR7-NEXT:    srawi r7, r8, 31
-; CHECK-PWR7-NEXT:    sub r10, r10, r22
-; CHECK-PWR7-NEXT:    ld r26, 400(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
-; CHECK-PWR7-NEXT:    srawi r22, r10, 31
-; CHECK-PWR7-NEXT:    xor r8, r8, r7
-; CHECK-PWR7-NEXT:    xor r10, r10, r22
-; CHECK-PWR7-NEXT:    sub r10, r10, r22
-; CHECK-PWR7-NEXT:    ld r25, 392(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r12, r9, r11
-; CHECK-PWR7-NEXT:    lbz r9, 319(r1)
-; CHECK-PWR7-NEXT:    lbz r11, 335(r1)
-; CHECK-PWR7-NEXT:    std r3, 96(r1)
-; CHECK-PWR7-NEXT:    sldi r12, r12, 56
-; CHECK-PWR7-NEXT:    std r3, 104(r1)
-; CHECK-PWR7-NEXT:    ld r22, 368(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sldi r10, r10, 56
-; CHECK-PWR7-NEXT:    std r10, 192(r1)
-; CHECK-PWR7-NEXT:    clrlwi r9, r9, 24
-; CHECK-PWR7-NEXT:    clrlwi r11, r11, 24
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    std r12, 272(r1)
-; CHECK-PWR7-NEXT:    std r12, 280(r1)
-; CHECK-PWR7-NEXT:    srawi r12, r19, 31
-; CHECK-PWR7-NEXT:    xor r0, r19, r12
-; CHECK-PWR7-NEXT:    ld r19, 344(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r3, r0, r12
-; CHECK-PWR7-NEXT:    srawi r11, r9, 31
-; CHECK-PWR7-NEXT:    std r10, 200(r1)
-; CHECK-PWR7-NEXT:    xor r9, r9, r11
+; CHECK-PWR7-NEXT:    sldi r22, r22, 56
+; CHECK-PWR7-NEXT:    ld r24, 448(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r23, 440(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r14, 312(r1)
+; CHECK-PWR7-NEXT:    std r15, 288(r1)
+; CHECK-PWR7-NEXT:    std r3, 208(r1)
+; CHECK-PWR7-NEXT:    std r3, 216(r1)
+; CHECK-PWR7-NEXT:    lwz r3, 60(r1) # 4-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r15, 296(r1)
+; CHECK-PWR7-NEXT:    ld r21, 424(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r20, 416(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r22, 224(r1)
+; CHECK-PWR7-NEXT:    std r22, 232(r1)
+; CHECK-PWR7-NEXT:    sub r4, r3, r4
+; CHECK-PWR7-NEXT:    std r25, 192(r1)
+; CHECK-PWR7-NEXT:    ld r22, 432(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r19, 408(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    srawi r3, r4, 31
+; CHECK-PWR7-NEXT:    std r25, 200(r1)
+; CHECK-PWR7-NEXT:    ld r25, 456(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r27, 176(r1)
+; CHECK-PWR7-NEXT:    std r27, 184(r1)
+; CHECK-PWR7-NEXT:    xor r4, r4, r3
+; CHECK-PWR7-NEXT:    std r29, 160(r1)
+; CHECK-PWR7-NEXT:    ld r27, 472(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r29, 168(r1)
+; CHECK-PWR7-NEXT:    std r0, 144(r1)
+; CHECK-PWR7-NEXT:    sub r3, r4, r3
+; CHECK-PWR7-NEXT:    std r0, 152(r1)
+; CHECK-PWR7-NEXT:    ld r29, 488(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    ld r18, 400(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    sub r9, r9, r11
-; CHECK-PWR7-NEXT:    std r3, 80(r1)
-; CHECK-PWR7-NEXT:    std r3, 88(r1)
-; CHECK-PWR7-NEXT:    sldi r9, r9, 56
-; CHECK-PWR7-NEXT:    std r9, 288(r1)
-; CHECK-PWR7-NEXT:    std r9, 296(r1)
-; CHECK-PWR7-NEXT:    srawi r9, r20, 31
-; CHECK-PWR7-NEXT:    xor r11, r20, r9
-; CHECK-PWR7-NEXT:    ld r20, 352(r1) # 8-byte Folded Reload
-; CHECK-PWR7-NEXT:    sub r4, r11, r9
-; CHECK-PWR7-NEXT:    sldi r3, r4, 56
+; CHECK-PWR7-NEXT:    std r11, 128(r1)
+; CHECK-PWR7-NEXT:    ld r17, 392(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r11, 136(r1)
+; CHECK-PWR7-NEXT:    std r9, 112(r1)
 ; CHECK-PWR7-NEXT:    std r3, 64(r1)
 ; CHECK-PWR7-NEXT:    std r3, 72(r1)
-; CHECK-PWR7-NEXT:    sub r3, r8, r7
-; CHECK-PWR7-NEXT:    sldi r3, r3, 56
-; CHECK-PWR7-NEXT:    std r3, 48(r1)
-; CHECK-PWR7-NEXT:    std r3, 56(r1)
-; CHECK-PWR7-NEXT:    addi r3, r1, 288
+; CHECK-PWR7-NEXT:    addi r3, r1, 304
+; CHECK-PWR7-NEXT:    std r9, 120(r1)
+; CHECK-PWR7-NEXT:    ld r15, 376(r1) # 8-byte Folded Reload
+; CHECK-PWR7-NEXT:    std r7, 96(r1)
+; CHECK-PWR7-NEXT:    std r7, 104(r1)
+; CHECK-PWR7-NEXT:    std r5, 80(r1)
+; CHECK-PWR7-NEXT:    std r5, 88(r1)
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    addi r3, r1, 288
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 256
+; CHECK-PWR7-NEXT:    addi r3, r1, 272
+; CHECK-PWR7-NEXT:    ld r14, 368(r1) # 8-byte Folded Reload
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 240
+; CHECK-PWR7-NEXT:    addi r3, r1, 256
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 224
+; CHECK-PWR7-NEXT:    addi r3, r1, 240
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 208
+; CHECK-PWR7-NEXT:    addi r3, r1, 224
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 192
+; CHECK-PWR7-NEXT:    addi r3, r1, 208
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 176
+; CHECK-PWR7-NEXT:    addi r3, r1, 192
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 160
+; CHECK-PWR7-NEXT:    addi r3, r1, 176
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs0, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v2, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 144
+; CHECK-PWR7-NEXT:    addi r3, r1, 160
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 128
+; CHECK-PWR7-NEXT:    addi r3, r1, 144
 ; CHECK-PWR7-NEXT:    vmrghb v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 112
+; CHECK-PWR7-NEXT:    addi r3, r1, 128
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
+; CHECK-PWR7-NEXT:    addi r3, r1, 112
 ; CHECK-PWR7-NEXT:    vmrghh v2, v3, v2
 ; CHECK-PWR7-NEXT:    lxvw4x v3, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 80
+; CHECK-PWR7-NEXT:    addi r3, r1, 96
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 64
+; CHECK-PWR7-NEXT:    addi r3, r1, 80
 ; CHECK-PWR7-NEXT:    vmrghb v3, v4, v3
 ; CHECK-PWR7-NEXT:    lxvw4x v4, 0, r3
-; CHECK-PWR7-NEXT:    addi r3, r1, 48
+; CHECK-PWR7-NEXT:    addi r3, r1, 64
 ; CHECK-PWR7-NEXT:    lxvw4x v5, 0, r3
 ; CHECK-PWR7-NEXT:    vmrghb v4, v5, v4
 ; CHECK-PWR7-NEXT:    vmrghh v3, v4, v3
 ; CHECK-PWR7-NEXT:    xxmrghw vs1, v3, v2
 ; CHECK-PWR7-NEXT:    xxmrghd v2, vs1, vs0
-; CHECK-PWR7-NEXT:    addi r1, r1, 448
+; CHECK-PWR7-NEXT:    addi r1, r1, 512
 ; CHECK-PWR7-NEXT:    blr
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
diff --git a/llvm/test/CodeGen/PowerPC/tls-picgot.ll b/llvm/test/CodeGen/PowerPC/tls-picgot.ll
new file mode 100644
index 000000000000..6562d864d1ba
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/tls-picgot.ll
@@ -0,0 +1,31 @@
+; RUN: llc -verify-machineinstrs -relocation-model=pic < %s | FileCheck %s
+
+target triple = "powerpc-unknown-linux-gnu"
+
+; Test that LR is preserved when PPC32PICGOT clobbers it with a local "bl".
+
+@TLS = external thread_local global i8
+
+; CHECK-LABEL: tls_addr:
+; CHECK:        mflr [[SAVED_REG:[0-9]+]]
+
+; CHECK:        bl [[JUMP:\.L[[:alnum:]_]+]]
+; CHECK-NEXT:   [[OFFSET:\.L[[:alnum:]_]+]]:
+; CHECK-NEXT:   .long _GLOBAL_OFFSET_TABLE_-[[OFFSET]]
+; CHECK-NEXT:   [[JUMP]]
+; CHECK-NEXT:   mflr {{[0-9]+}}
+
+; CHECK:        mtlr [[SAVED_REG]]
+; CHECK-NEXT:   blr
+
+define ptr @tls_addr() unnamed_addr {
+  %1 = call ptr @llvm.threadlocal.address.p0(ptr @TLS)
+  ret ptr %1
+}
+
+declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 8, !"PIC Level", i32 2}
+!1 = !{i32 7, !"PIE Level", i32 2}
diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
index 246e6a614d6a..117e3e4aac45 100644
--- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll
+++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll
@@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IF-NEXT:    mv a1, a0
 ; RV32IF-NEXT:    addi a0, sp, 8
 ; RV32IF-NEXT:    call __fixdfti
-; RV32IF-NEXT:    lw a0, 8(sp)
-; RV32IF-NEXT:    lw a1, 12(sp)
-; RV32IF-NEXT:    lw a2, 20(sp)
+; RV32IF-NEXT:    lw a0, 20(sp)
+; RV32IF-NEXT:    lw a1, 8(sp)
+; RV32IF-NEXT:    lw a2, 12(sp)
 ; RV32IF-NEXT:    lw a3, 16(sp)
-; RV32IF-NEXT:    beqz a2, .LBB47_2
+; RV32IF-NEXT:    beqz a0, .LBB47_2
 ; RV32IF-NEXT:  # %bb.1: # %entry
-; RV32IF-NEXT:    slti a4, a2, 0
+; RV32IF-NEXT:    slti a4, a0, 0
 ; RV32IF-NEXT:    j .LBB47_3
 ; RV32IF-NEXT:  .LBB47_2:
 ; RV32IF-NEXT:    seqz a4, a3
 ; RV32IF-NEXT:  .LBB47_3: # %entry
 ; RV32IF-NEXT:    xori a3, a3, 1
-; RV32IF-NEXT:    or a3, a3, a2
+; RV32IF-NEXT:    or a3, a3, a0
 ; RV32IF-NEXT:    seqz a3, a3
 ; RV32IF-NEXT:    addi a3, a3, -1
 ; RV32IF-NEXT:    and a3, a3, a4
 ; RV32IF-NEXT:    neg a3, a3
+; RV32IF-NEXT:    and a2, a3, a2
 ; RV32IF-NEXT:    and a1, a3, a1
 ; RV32IF-NEXT:    and a0, a3, a0
-; RV32IF-NEXT:    and a2, a3, a2
-; RV32IF-NEXT:    slti a2, a2, 0
-; RV32IF-NEXT:    addi a2, a2, -1
-; RV32IF-NEXT:    and a0, a2, a0
-; RV32IF-NEXT:    and a1, a2, a1
+; RV32IF-NEXT:    slti a0, a0, 0
+; RV32IF-NEXT:    addi a3, a0, -1
+; RV32IF-NEXT:    and a0, a3, a1
+; RV32IF-NEXT:    and a1, a3, a2
 ; RV32IF-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IF-NEXT:    .cfi_restore ra
 ; RV32IF-NEXT:    addi sp, sp, 32
@@ -3354,30 +3354,30 @@ define i64 @ustest_f64i64_mm(double %x) {
 ; RV32IFD-NEXT:    .cfi_offset ra, -4
 ; RV32IFD-NEXT:    addi a0, sp, 8
 ; RV32IFD-NEXT:    call __fixdfti
-; RV32IFD-NEXT:    lw a0, 8(sp)
-; RV32IFD-NEXT:    lw a1, 12(sp)
-; RV32IFD-NEXT:    lw a2, 20(sp)
+; RV32IFD-NEXT:    lw a0, 20(sp)
+; RV32IFD-NEXT:    lw a1, 8(sp)
+; RV32IFD-NEXT:    lw a2, 12(sp)
 ; RV32IFD-NEXT:    lw a3, 16(sp)
-; RV32IFD-NEXT:    beqz a2, .LBB47_2
+; RV32IFD-NEXT:    beqz a0, .LBB47_2
 ; RV32IFD-NEXT:  # %bb.1: # %entry
-; RV32IFD-NEXT:    slti a4, a2, 0
+; RV32IFD-NEXT:    slti a4, a0, 0
 ; RV32IFD-NEXT:    j .LBB47_3
 ; RV32IFD-NEXT:  .LBB47_2:
 ; RV32IFD-NEXT:    seqz a4, a3
 ; RV32IFD-NEXT:  .LBB47_3: # %entry
 ; RV32IFD-NEXT:    xori a3, a3, 1
-; RV32IFD-NEXT:    or a3, a3, a2
+; RV32IFD-NEXT:    or a3, a3, a0
 ; RV32IFD-NEXT:    seqz a3, a3
 ; RV32IFD-NEXT:    addi a3, a3, -1
 ; RV32IFD-NEXT:    and a3, a3, a4
 ; RV32IFD-NEXT:    neg a3, a3
+; RV32IFD-NEXT:    and a2, a3, a2
 ; RV32IFD-NEXT:    and a1, a3, a1
 ; RV32IFD-NEXT:    and a0, a3, a0
-; RV32IFD-NEXT:    and a2, a3, a2
-; RV32IFD-NEXT:    slti a2, a2, 0
-; RV32IFD-NEXT:    addi a2, a2, -1
-; RV32IFD-NEXT:    and a0, a2, a0
-; RV32IFD-NEXT:    and a1, a2, a1
+; RV32IFD-NEXT:    slti a0, a0, 0
+; RV32IFD-NEXT:    addi a3, a0, -1
+; RV32IFD-NEXT:    and a0, a3, a1
+; RV32IFD-NEXT:    and a1, a3, a2
 ; RV32IFD-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32IFD-NEXT:    .cfi_restore ra
 ; RV32IFD-NEXT:    addi sp, sp, 32
@@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) {
 ; RV32-NEXT:    .cfi_offset ra, -4
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB50_2
+; RV32-NEXT:    beqz a0, .LBB50_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB50_3
 ; RV32-NEXT:  .LBB50_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB50_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
@@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) {
 ; RV32-NEXT:    call __extendhfsf2
 ; RV32-NEXT:    addi a0, sp, 8
 ; RV32-NEXT:    call __fixsfti
-; RV32-NEXT:    lw a0, 8(sp)
-; RV32-NEXT:    lw a1, 12(sp)
-; RV32-NEXT:    lw a2, 20(sp)
+; RV32-NEXT:    lw a0, 20(sp)
+; RV32-NEXT:    lw a1, 8(sp)
+; RV32-NEXT:    lw a2, 12(sp)
 ; RV32-NEXT:    lw a3, 16(sp)
-; RV32-NEXT:    beqz a2, .LBB53_2
+; RV32-NEXT:    beqz a0, .LBB53_2
 ; RV32-NEXT:  # %bb.1: # %entry
-; RV32-NEXT:    slti a4, a2, 0
+; RV32-NEXT:    slti a4, a0, 0
 ; RV32-NEXT:    j .LBB53_3
 ; RV32-NEXT:  .LBB53_2:
 ; RV32-NEXT:    seqz a4, a3
 ; RV32-NEXT:  .LBB53_3: # %entry
 ; RV32-NEXT:    xori a3, a3, 1
-; RV32-NEXT:    or a3, a3, a2
+; RV32-NEXT:    or a3, a3, a0
 ; RV32-NEXT:    seqz a3, a3
 ; RV32-NEXT:    addi a3, a3, -1
 ; RV32-NEXT:    and a3, a3, a4
 ; RV32-NEXT:    neg a3, a3
+; RV32-NEXT:    and a2, a3, a2
 ; RV32-NEXT:    and a1, a3, a1
 ; RV32-NEXT:    and a0, a3, a0
-; RV32-NEXT:    and a2, a3, a2
-; RV32-NEXT:    slti a2, a2, 0
-; RV32-NEXT:    addi a2, a2, -1
-; RV32-NEXT:    and a0, a2, a0
-; RV32-NEXT:    and a1, a2, a1
+; RV32-NEXT:    slti a0, a0, 0
+; RV32-NEXT:    addi a3, a0, -1
+; RV32-NEXT:    and a0, a3, a1
+; RV32-NEXT:    and a1, a3, a2
 ; RV32-NEXT:    lw ra, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    .cfi_restore ra
 ; RV32-NEXT:    addi sp, sp, 32
diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
index b1a6d163664e..97d102561129 100644
--- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
+++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll
@@ -7,18 +7,18 @@
 define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ; RV32-LABEL: ctz_nxv4i32:
 ; RV32:       # %bb.0:
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT:    vid.v v10
-; RV32-NEXT:    vmv.v.i v11, -1
 ; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV32-NEXT:    vid.v v10
+; RV32-NEXT:    li a1, -1
 ; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV32-NEXT:    vmsne.vi v0, v8, 0
 ; RV32-NEXT:    srli a0, a0, 1
 ; RV32-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV32-NEXT:    vmv.v.x v8, a0
-; RV32-NEXT:    vmacc.vv v8, v10, v11
-; RV32-NEXT:    vmv.v.i v9, 0
-; RV32-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV32-NEXT:    vmadd.vx v10, a1, v8
+; RV32-NEXT:    vmv.v.i v8, 0
+; RV32-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV32-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV32-NEXT:    vmv.x.s a1, v8
 ; RV32-NEXT:    sub a0, a0, a1
@@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32(<vscale x 4 x i32> %a) #0 {
 ;
 ; RV64-LABEL: ctz_nxv4i32:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT:    vid.v v10
-; RV64-NEXT:    vmv.v.i v11, -1
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; RV64-NEXT:    vid.v v10
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    srli a0, a0, 1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmacc.vv v8, v10, v11
-; RV64-NEXT:    vmv.v.i v9, 0
-; RV64-NEXT:    vmerge.vvm v8, v9, v8, v0
+; RV64-NEXT:    vmadd.vx v10, a1, v8
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vvm v8, v8, v10, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    subw a0, a0, a1
@@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range(<vscale x 8 x i16> %a) {
 ;
 ; RV64-LABEL: ctz_nxv8i1_no_range:
 ; RV64:       # %bb.0:
-; RV64-NEXT:    vsetvli a0, zero, e64, m8, ta, ma
-; RV64-NEXT:    vid.v v16
-; RV64-NEXT:    vmv.v.i v24, -1
 ; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    vsetvli a1, zero, e64, m8, ta, ma
+; RV64-NEXT:    vid.v v16
+; RV64-NEXT:    li a1, -1
 ; RV64-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
 ; RV64-NEXT:    vmsne.vi v0, v8, 0
 ; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
 ; RV64-NEXT:    vmv.v.x v8, a0
-; RV64-NEXT:    vmacc.vv v8, v16, v24
-; RV64-NEXT:    vmv.v.i v16, 0
-; RV64-NEXT:    vmerge.vvm v8, v16, v8, v0
+; RV64-NEXT:    vmadd.vx v16, a1, v8
+; RV64-NEXT:    vmv.v.i v8, 0
+; RV64-NEXT:    vmerge.vvm v8, v8, v16, v0
 ; RV64-NEXT:    vredmaxu.vs v8, v8, v8
 ; RV64-NEXT:    vmv.x.s a1, v8
 ; RV64-NEXT:    sub a0, a0, a1
diff --git a/llvm/test/CodeGen/RISCV/pr148084.ll b/llvm/test/CodeGen/RISCV/pr148084.ll
new file mode 100644
index 000000000000..9fa26c74021c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr148084.ll
@@ -0,0 +1,279 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s | FileCheck %s
+
+source_filename = "external/libaom/av1/encoder/tx_search.c"
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-android10000"
+
+define fastcc void @search_tx_type() #0 {
+; CHECK-LABEL: search_tx_type:
+; CHECK:       # %bb.0: # %._crit_edge.i
+; CHECK-NEXT:  # %bb.1: # %bb
+; CHECK-NEXT:    lbu a1, 0(zero)
+; CHECK-NEXT:    lw a0, 0(zero)
+; CHECK-NEXT:    lh a2, 0(zero)
+; CHECK-NEXT:    seqz a1, a1
+; CHECK-NEXT:    srai a3, a0, 63
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a1, a1, a2
+; CHECK-NEXT:    andi a2, a1, 1
+; CHECK-NEXT:    addi a2, a2, -1
+; CHECK-NEXT:    or a3, a3, a0
+; CHECK-NEXT:    or a2, a2, a3
+; CHECK-NEXT:    bgez a2, .LBB0_3
+; CHECK-NEXT:  # %bb.2:
+; CHECK-NEXT:    bexti a3, a1, 1
+; CHECK-NEXT:    addi a3, a3, -1
+; CHECK-NEXT:    and a2, a2, a3
+; CHECK-NEXT:  .LBB0_3: # %bb
+; CHECK-NEXT:    andi a4, a1, 4
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a4, .LBB0_5
+; CHECK-NEXT:  # %bb.4: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_5: # %bb
+; CHECK-NEXT:    blt a2, a0, .LBB0_7
+; CHECK-NEXT:  # %bb.6: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_7: # %bb
+; CHECK-NEXT:    andi a5, a1, 8
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_9
+; CHECK-NEXT:  # %bb.8: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_9: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_11
+; CHECK-NEXT:  # %bb.10: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_11: # %bb
+; CHECK-NEXT:    andi a5, a1, 16
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_13
+; CHECK-NEXT:  # %bb.12: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_13: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_15
+; CHECK-NEXT:  # %bb.14: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_15: # %bb
+; CHECK-NEXT:    andi a5, a1, 32
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_17
+; CHECK-NEXT:  # %bb.16: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_17: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_19
+; CHECK-NEXT:  # %bb.18: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_19: # %bb
+; CHECK-NEXT:    andi a5, a1, 64
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_21
+; CHECK-NEXT:  # %bb.20: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_21: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_23
+; CHECK-NEXT:  # %bb.22: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_23: # %bb
+; CHECK-NEXT:    andi a5, a1, 128
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_25
+; CHECK-NEXT:  # %bb.24: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_25: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_27
+; CHECK-NEXT:  # %bb.26: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_27: # %bb
+; CHECK-NEXT:    andi a5, a1, 256
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_29
+; CHECK-NEXT:  # %bb.28: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_29: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_31
+; CHECK-NEXT:  # %bb.30: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_31: # %bb
+; CHECK-NEXT:    andi a5, a1, 512
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    beqz a5, .LBB0_33
+; CHECK-NEXT:  # %bb.32: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_33: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_35
+; CHECK-NEXT:  # %bb.34: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_35: # %bb
+; CHECK-NEXT:    andi a5, a1, 1024
+; CHECK-NEXT:    sext.w a4, a2
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:    beqz a5, .LBB0_37
+; CHECK-NEXT:  # %bb.36: # %bb
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB0_37: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_39
+; CHECK-NEXT:  # %bb.38: # %bb
+; CHECK-NEXT:    mv a3, a2
+; CHECK-NEXT:  .LBB0_39: # %bb
+; CHECK-NEXT:    slli a5, a1, 52
+; CHECK-NEXT:    sext.w a4, a3
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    bgez a5, .LBB0_41
+; CHECK-NEXT:  # %bb.40: # %bb
+; CHECK-NEXT:    mv a2, a0
+; CHECK-NEXT:  .LBB0_41: # %bb
+; CHECK-NEXT:    blt a4, a0, .LBB0_43
+; CHECK-NEXT:  # %bb.42: # %bb
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:  .LBB0_43: # %bb
+; CHECK-NEXT:    slli a4, a1, 51
+; CHECK-NEXT:    sext.w a3, a2
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    bltz a4, .LBB0_49
+; CHECK-NEXT:  # %bb.44: # %bb
+; CHECK-NEXT:    bge a3, a0, .LBB0_50
+; CHECK-NEXT:  .LBB0_45: # %bb
+; CHECK-NEXT:    sext.w a2, a1
+; CHECK-NEXT:    blt a2, a0, .LBB0_47
+; CHECK-NEXT:  .LBB0_46: # %bb
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB0_47: # %bb
+; CHECK-NEXT:    sext.w a0, a0
+; CHECK-NEXT:  # %bb.48: # %get_tx_mask.exit
+; CHECK-NEXT:    ret
+; CHECK-NEXT:  .LBB0_49: # %bb
+; CHECK-NEXT:    mv a1, a0
+; CHECK-NEXT:    blt a3, a0, .LBB0_45
+; CHECK-NEXT:  .LBB0_50: # %bb
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    sext.w a2, a2
+; CHECK-NEXT:    bge a2, a0, .LBB0_46
+; CHECK-NEXT:    j .LBB0_47
+._crit_edge.i:
+  %.in196.i = load i16, ptr null, align 2
+  %i2 = load i16, ptr null, align 2
+  %i3 = and i16 %i2, %.in196.i
+  %i9 = trunc nuw i8 0 to i1
+  br i1 %i9, label %get_tx_mask.exit, label %bb
+
+bb:                                               ; preds = %._crit_edge.i
+  %i13 = load i8, ptr null, align 1
+  %i14 = icmp eq i8 %i13, 0
+  %spec.select211.i = select i1 %i14, i16 0, i16 %i3
+  %i19 = load i32, ptr null, align 4
+  %i20 = zext i16 %spec.select211.i to i32
+  %i21 = load i32, ptr null, align 4
+  %i22 = icmp sgt i32 %i21, -1
+  %i23 = and i32 %i20, 1
+  %.not203.i = icmp eq i32 %i23, 0
+  %spec.select212.i = select i1 %.not203.i, i32 -1, i32 %i21
+  %.1174.i = select i1 %i22, i32 %spec.select212.i, i32 -1
+  %i28 = icmp sgt i32 0, %.1174.i
+  %i29 = and i32 %i20, 2
+  %.not203.1.not.i = icmp eq i32 %i29, 0
+  %spec.select212.1.i = select i1 %.not203.1.not.i, i32 %.1174.i, i32 0
+  %.1174.1.i = select i1 %i28, i32 %spec.select212.1.i, i32 %.1174.i
+  %i30 = load i32, ptr null, align 4
+  %i31 = icmp sgt i32 %i30, %.1174.1.i
+  %i32 = and i32 %i20, 4
+  %.not203.2.i = icmp eq i32 %i32, 0
+  %spec.select212.2.i = select i1 %.not203.2.i, i32 %.1174.1.i, i32 %i30
+  %.1174.2.i = select i1 %i31, i32 %spec.select212.2.i, i32 %.1174.1.i
+  %i36 = load i32, ptr null, align 4
+  %i37 = icmp sgt i32 %i36, %.1174.2.i
+  %i38 = and i32 %i20, 8
+  %.not203.3.i = icmp eq i32 %i38, 0
+  %spec.select212.3.i = select i1 %.not203.3.i, i32 %.1174.2.i, i32 %i36
+  %.1174.3.i = select i1 %i37, i32 %spec.select212.3.i, i32 %.1174.2.i
+  %i42 = load i32, ptr null, align 4
+  %i43 = icmp sgt i32 %i42, %.1174.3.i
+  %i44 = and i32 %i20, 16
+  %.not203.4.i = icmp eq i32 %i44, 0
+  %spec.select212.4.i = select i1 %.not203.4.i, i32 %.1174.3.i, i32 %i42
+  %.1174.4.i = select i1 %i43, i32 %spec.select212.4.i, i32 %.1174.3.i
+  %i48 = load i32, ptr null, align 4
+  %i49 = icmp sgt i32 %i48, %.1174.4.i
+  %i50 = and i32 %i20, 32
+  %.not203.5.i = icmp eq i32 %i50, 0
+  %spec.select212.5.i = select i1 %.not203.5.i, i32 %.1174.4.i, i32 %i48
+  %.1174.5.i = select i1 %i49, i32 %spec.select212.5.i, i32 %.1174.4.i
+  %i51 = load i32, ptr null, align 4
+  %i52 = icmp sgt i32 %i51, %.1174.5.i
+  %i53 = and i32 %i20, 64
+  %.not203.6.i = icmp eq i32 %i53, 0
+  %spec.select212.6.i = select i1 %.not203.6.i, i32 %.1174.5.i, i32 %i51
+  %.1174.6.i = select i1 %i52, i32 %spec.select212.6.i, i32 %.1174.5.i
+  %i56 = load i32, ptr null, align 4
+  %i57 = icmp sgt i32 %i56, %.1174.6.i
+  %i58 = and i32 %i20, 128
+  %.not203.7.i = icmp eq i32 %i58, 0
+  %spec.select212.7.i = select i1 %.not203.7.i, i32 %.1174.6.i, i32 %i56
+  %.1174.7.i = select i1 %i57, i32 %spec.select212.7.i, i32 %.1174.6.i
+  %i60 = load i32, ptr null, align 4
+  %i61 = icmp sgt i32 %i60, %.1174.7.i
+  %i62 = and i32 %i20, 256
+  %.not203.8.i = icmp eq i32 %i62, 0
+  %spec.select212.8.i = select i1 %.not203.8.i, i32 %.1174.7.i, i32 %i60
+  %.1174.8.i = select i1 %i61, i32 %spec.select212.8.i, i32 %.1174.7.i
+  %i63 = load i32, ptr null, align 4
+  %i64 = icmp sgt i32 %i63, %.1174.8.i
+  %i65 = and i32 %i20, 512
+  %.not203.9.i = icmp eq i32 %i65, 0
+  %spec.select212.9.i = select i1 %.not203.9.i, i32 %.1174.8.i, i32 %i63
+  %.1174.9.i = select i1 %i64, i32 %spec.select212.9.i, i32 %.1174.8.i
+  %i67 = load i32, ptr null, align 4
+  %i68 = icmp sgt i32 %i67, %.1174.9.i
+  %i69 = and i32 %i20, 1024
+  %.not203.10.i = icmp eq i32 %i69, 0
+  %spec.select212.10.i = select i1 %.not203.10.i, i32 %.1174.9.i, i32 %i67
+  %.1174.10.i = select i1 %i68, i32 %spec.select212.10.i, i32 %.1174.9.i
+  %i70 = load i32, ptr null, align 4
+  %i71 = icmp sgt i32 %i70, %.1174.10.i
+  %i72 = and i32 %i20, 2048
+  %.not203.11.i = icmp eq i32 %i72, 0
+  %spec.select212.11.i = select i1 %.not203.11.i, i32 %.1174.10.i, i32 %i70
+  %.1174.11.i = select i1 %i71, i32 %spec.select212.11.i, i32 %.1174.10.i
+  %i75 = load i32, ptr null, align 4
+  %i76 = icmp sgt i32 %i75, %.1174.11.i
+  %i77 = and i32 %i20, 4096
+  %.not203.12.i = icmp eq i32 %i77, 0
+  %spec.select212.12.i = select i1 %.not203.12.i, i32 %.1174.11.i, i32 %i75
+  %.1174.12.i = select i1 %i76, i32 %spec.select212.12.i, i32 %.1174.11.i
+  %i80 = load i32, ptr null, align 4
+  %i81 = icmp sgt i32 %i80, %.1174.12.i
+  %spec.select212.13.i = select i1 false, i32 %.1174.12.i, i32 %i80
+  %.1174.13.i = select i1 %i81, i32 %spec.select212.13.i, i32 %.1174.12.i
+  %.1172.13.i = select i1 %i81, i32 13, i32 0
+  %i84 = icmp sgt i32 0, %.1174.13.i
+  %.1172.14.i = select i1 %i84, i32 14, i32 %.1172.13.i
+  %i88 = icmp slt i32 0, %i19
+  %i89 = select i1 %i88, i16 -32768, i16 0
+  %i90 = zext i16 %i89 to i32
+  %i91 = shl nuw nsw i32 1, %.1172.14.i
+  %i92 = and i32 %i91, %i90
+  %.not200.i = icmp eq i32 %i92, 0
+  %i93 = trunc nuw i32 %i91 to i16
+  %i94 = xor i16 %i93, -1
+  %i95 = select i1 %.not200.i, i16 -1, i16 %i94
+  %.2177.i = and i16 %i95, %i89
+  %i96 = xor i16 %.2177.i, -1
+  %i97 = and i16 %spec.select211.i, %i96
+  br label %get_tx_mask.exit
+
+get_tx_mask.exit:                                 ; preds = %._crit_edge.i, %bb
+  %.1261.i = phi i16 [ %i97, %bb ], [ 0, %._crit_edge.i ]
+  %i99 = icmp eq i16 %.1261.i, 0
+  %.2262.i = select i1 %i99, i16 0, i16 %.1261.i
+  ret void
+}
+
+attributes #0 = { noimplicitfloat nounwind sspstrong uwtable vscale_range(2,1024) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+b,+c,+d,+f,+m,+relax,+unaligned-scalar-mem,+unaligned-vector-mem,+v,+zaamo,+zalrsc,+zba,+zbb,+zbs,+zca,+zcd,+zicsr,+zifencei,+zmmul,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-e,-experimental-p,-experimental-smctr,-experimental-ssctr,-experimental-svukte,-experimental-xqccmp,-experimental-xqcia,-experimental-xqciac,-experimental-xqcibi,-experimental-xqcibm,-experimental-xqcicli,-experimental-xqcicm,-experimental-xqcics,-experimental-xqcicsr,-experimental-xqciint,-experimental-xqciio,-experimental-xqcilb,-experimental-xqcili,-experimental-xqcilia,-experimental-xqcilo,-experimental-xqcilsm,-experimental-xqcisim,-experimental-xqcisls,-experimental-xqcisync,-experimental-xrivosvisni,-experimental-xrivosvizip,-experimental-xsfmclic,-experimental-xsfsclic,-experimental-zalasr,-experimental-zicfilp,-experimental-zicfiss,-experimental-zvbc32e,-experimental-zvkgs,-experimental-zvqdotq,-h,-q,-sdext,-sdtrig,-sha,-shcounterenw,-shgatpa,-shlcofideleg,-shtvala,-shvsatpa,-shvstvala,-shvstvecd,-smaia,-smcdeleg,-smcntrpmf,-smcsrind,-smdbltrp,-smepmp,-smmpm,-smnpm,-smrnmi,-smstateen,-ssaia,-ssccfg,-ssccptr,-sscofpmf,-sscounterenw,-sscsrind,-ssdbltrp,-ssnpm,-sspm,-ssqosid,-ssstateen,-ssstrict,-sstc,-sstvala,-sstvecd,-ssu64xl,-supm,-svade,-svadu,-svbare,-svinval,-svnapot,-svpbmt,-svvptc,-xandesperf,-xandesvbfhcvt,-xandesvdot,-xandesvpackfph,-xcvalu,-xcvbi,-xcvbitmanip,-xcvelw,-xcvmac,-xcvmem,-xcvsimd,-xmipscmov,-xmipslsp,-xsfcease,-xsfmm128t,-xsfmm16t,-xsfmm32a16f,-xsfmm32a32f,-xsfmm32a8f,-xsfmm32a8i,-xsfmm32t,-xsfmm64a64f,-xsfmm64t,-xsfmmbase,-xsfvcp,-xsfvfnrclipxfqf,-xsfvfwmaccqqq,-xsfvqmaccdod,-xsfvqmaccqoq,-xsifivecdiscarddlone,-xsifivecflushdlone,-xtheadba,-xtheadbb,-xtheadbs,-xtheadcmo,-xtheadcondmov,-xtheadfmemidx,-xtheadmac,-xtheadmemidx,-xtheadmempair,-xtheadsync,-xtheadvdot,-xventanacondops,-xwchc,-za128rs,-za64rs,-zabha,-zacas,-zama16b,-zawrs,-zbc,-zbkb,-zbkc,-zbkx,-zcb,-zce,-zcf,-zclsd,-zcmop,-zcmp,-zcmt,-zdinx,-zfa,-zfbfmin,-zfh,-zfhmin,-zfinx,-zhinx,-zhinxmin,-zic64b,-zicbom,-zicbop,-zicboz,-ziccamoa,-ziccamoc,-ziccif,-zicclsm,-ziccrse,-zicntr,-zicond,-zihintntl,-zihintpause,-zihpm,-zilsd,-zimop,-zk,-zkn,-zknd,-zkne,-zknh,-zkr,-zks,-zksed,-zksh,-zkt,-ztso,-zvbb,-zvbc,-zvfbfmin,-zvfbfwma,-zvfh,-zvfhmin,-zvkb,-zvkg,-zvkn,-zvknc,-zvkned,-zvkng,-zvknha,-zvknhb,-zvks,-zvksc,-zvksed,-zvksg,-zvksh,-zvkt,-zvl1024b,-zvl16384b,-zvl2048b,-zvl256b,-zvl32768b,-zvl4096b,-zvl512b,-zvl65536b,-zvl8192b" }
diff --git a/llvm/test/CodeGen/RISCV/pr153598.mir b/llvm/test/CodeGen/RISCV/pr153598.mir
new file mode 100644
index 000000000000..a084197fe83c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/pr153598.mir
@@ -0,0 +1,23 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv32 -mattr=+zcmp -run-pass=riscv-move-merge -verify-machineinstrs %s -o - | FileCheck %s
+---
+name: mov-merge
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: $x8, $x9
+    ; CHECK-LABEL: name: mov-merge
+    ; CHECK: liveins: $x8, $x9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: $x12 = ADDI $x0, -3
+    ; CHECK-NEXT: SW renamable $x9, $x2, 56
+    ; CHECK-NEXT: CM_MVA01S killed renamable $x9, renamable $x8, implicit-def $x10, implicit-def $x11
+    ; CHECK-NEXT: SW renamable $x8, $x2, 60
+    ; CHECK-NEXT: PseudoRET
+    $x12 = ADDI $x0, -3
+    SW renamable $x9, $x2, 56
+    $x10 = ADDI killed renamable $x9, 0
+    SW renamable $x8, $x2, 60
+    $x11 = ADDI killed renamable $x8, 0
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
index a050034c6316..a7eaf3979323 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir
@@ -78,12 +78,12 @@ body: |
     ; CHECK-NEXT: %false:vrnov0 = COPY $v9
     ; CHECK-NEXT: %mask:vmv0 = COPY $v0
     ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 8, 5 /* e32 */, 0 /* tu, mu */
+    ; CHECK-NEXT: %x:vr = PseudoVMV_V_V_M1 %pt, %true, 4, 5 /* e32 */, 0 /* tu, mu */
     %pt:vrnov0 = COPY $v8
     %false:vrnov0 = COPY $v9
     %mask:vmv0 = COPY $v0
-    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
-    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 8, 5 /* e32 */
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 8, 5 /* e32 */, 0 /* tu, mu */
+    %x:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 4, 5 /* e32 */
 ...
 ---
 # Shouldn't be converted because false operands are different
@@ -163,3 +163,47 @@ body: |
     %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
   bb.1:
     %5:vrnov0 = PseudoVMERGE_VVM_M1 $noreg, %false, %true, %mask, 4, 5 /* e32 */
+...
+---
+# Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+name: preserve_false
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-LABEL: name: preserve_false
+    ; CHECK: liveins: $v8, $v9, $v0, $x8, $x9
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %pt:vrnov0 = COPY $v8
+    ; CHECK-NEXT: %false:vr = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %avl1:gprnox0 = COPY $x8
+    ; CHECK-NEXT: %avl2:gprnox0 = COPY $x9
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %avl1:gprnox0 = COPY $x8
+    %avl2:gprnox0 = COPY $x9
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, %avl1, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, %avl2, 5 /* e32 */
+...
+---
+# But we can convert this one because vmerge's avl being <= true's means we don't lose any false elements past avl.
+name: preserve_false_avl_known_le
+body: |
+  bb.0:
+    liveins: $v8, $v9, $v0
+    ; CHECK-LABEL: name: preserve_false_avl_known_le
+    ; CHECK: liveins: $v8, $v9, $v0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: %pt:vr = COPY $v8
+    ; CHECK-NEXT: %false:vrnov0 = COPY $v9
+    ; CHECK-NEXT: %mask:vmv0 = COPY $v0
+    ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 1, 5 /* e32 */, 3 /* ta, ma */
+    ; CHECK-NEXT: [[PseudoVMV_V_V_M1_:%[0-9]+]]:vr = PseudoVMV_V_V_M1 %pt, %true, 1, 5 /* e32 */, 0 /* tu, mu */
+    %pt:vrnov0 = COPY $v8
+    %false:vr = COPY $v9
+    %mask:vmv0 = COPY $v0
+    %true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 2, 5 /* e32 */, 3 /* ta, ma */
+    %5:vrnov0 = PseudoVMERGE_VVM_M1 %pt, %false, %true, %mask, 1, 5 /* e32 */
diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
index 3aeb4e864627..9ffc84a8a0e4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rvv-vmerge-to-vmv.ll
@@ -71,10 +71,31 @@ define <vscale x 8 x i64> @vpmerge_m8(<vscale x 8 x i64> %x, <vscale x 8 x i64>
   ret <vscale x 8 x i64> %1
 }
 
-declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)
-declare <vscale x 2 x i8> @llvm.vp.merge.nxv2i8(<vscale x 2 x i1>, <vscale x 2 x i8>, <vscale x 2 x i8>, i32)
-declare <vscale x 4 x i8> @llvm.vp.merge.nxv4i8(<vscale x 4 x i1>, <vscale x 4 x i8>, <vscale x 4 x i8>, i32)
-declare <vscale x 8 x i8> @llvm.vp.merge.nxv8i8(<vscale x 8 x i1>, <vscale x 8 x i8>, <vscale x 8 x i8>, i32)
-declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
-declare <vscale x 8 x i32> @llvm.vp.merge.nxv8i32(<vscale x 8 x i1>, <vscale x 8 x i32>, <vscale x 8 x i32>, i32)
-declare <vscale x 8 x i64> @llvm.vp.merge.nxv8i64(<vscale x 8 x i1>, <vscale x 8 x i64>, <vscale x 8 x i64>, i32)
+; Shouldn't be converted because vmerge adds back in elements from false past avl that would be lost if we converted to vmv.v.v
+define <vscale x 2 x i32> @preserve_false(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask, i64 %avl1, i64 %avl2) {
+; CHECK-LABEL: preserve_false:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v10, v9
+; CHECK-NEXT:    vle32.v v10, (a0), v0.t
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v9, v10, v0
+; CHECK-NEXT:    ret
+  %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 %avl1, i64 3)
+  %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 %avl2)
+  ret <vscale x 2 x i32> %res
+}
+
+; Can fold this because its avl is known to be <= than true, so no elements from false need to be introduced past avl.
+define <vscale x 2 x i32> @preserve_false_avl_known_le(ptr %p, <vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: preserve_false_avl_known_le:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a0), v0.t
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %true = call <vscale x 2 x i32> @llvm.riscv.vle.mask(<vscale x 2 x i32> %false, ptr %p, <vscale x 2 x i1> %mask, i64 2, i64 3)
+  %res = call <vscale x 2 x i32> @llvm.riscv.vmerge(<vscale x 2 x i32> %pt, <vscale x 2 x i32> %false, <vscale x 2 x i32> %true, <vscale x 2 x i1> %mask, i64 1)
+  ret <vscale x 2 x i32> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
index f29c74ae69bf..697c582dcb38 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsha2cl.ll
@@ -21,7 +21,7 @@ define <vscale x 4 x i32> @intrinsic_vsha2cl_vv_nxv4i32_nxv4i32(<vscale x 4 x i3
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i32_nxv4i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v10, v12
+; CHECK-NEXT:    vsha2cl.vv v8, v10, v12
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i32> @llvm.riscv.vsha2cl.nxv4i32.nxv4i32(
@@ -45,7 +45,7 @@ define <vscale x 8 x i32> @intrinsic_vsha2cl_vv_nxv8i32_nxv8i32(<vscale x 8 x i3
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv8i32_nxv8i32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e32, m4, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v12, v16
+; CHECK-NEXT:    vsha2cl.vv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i32> @llvm.riscv.vsha2cl.nxv8i32.nxv8i32(
@@ -70,7 +70,7 @@ define <vscale x 16 x i32> @intrinsic_vsha2cl_vv_nxv16i32_nxv16i32(<vscale x 16
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re32.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m8, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v16, v24
+; CHECK-NEXT:    vsha2cl.vv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 16 x i32> @llvm.riscv.vsha2cl.nxv16i32.nxv16i32(
@@ -94,7 +94,7 @@ define <vscale x 4 x i64> @intrinsic_vsha2cl_vv_nxv4i64_nxv4i64(<vscale x 4 x i6
 ; CHECK-LABEL: intrinsic_vsha2cl_vv_nxv4i64_nxv4i64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vsetvli zero, a0, e64, m4, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v12, v16
+; CHECK-NEXT:    vsha2cl.vv v8, v12, v16
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 4 x i64> @llvm.riscv.vsha2cl.nxv4i64.nxv4i64(
@@ -119,7 +119,7 @@ define <vscale x 8 x i64> @intrinsic_vsha2cl_vv_nxv8i64_nxv8i64(<vscale x 8 x i6
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    vl8re64.v v24, (a0)
 ; CHECK-NEXT:    vsetvli zero, a1, e64, m8, tu, ma
-; CHECK-NEXT:    vsha2ch.vv v8, v16, v24
+; CHECK-NEXT:    vsha2cl.vv v8, v16, v24
 ; CHECK-NEXT:    ret
 entry:
   %a = call <vscale x 8 x i64> @llvm.riscv.vsha2cl.nxv8i64.nxv8i64(
diff --git a/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir
new file mode 100644
index 000000000000..9ab7f41045c4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/stack-probing-frame-setup.mir
@@ -0,0 +1,198 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=riscv64 -x mir -run-pass=prologepilog  -verify-machineinstrs < %s \
+# RUN:  | FileCheck %s -check-prefixes=RV64I
+# RUN: llc -mtriple=riscv32 -x mir -run-pass=prologepilog  -verify-machineinstrs < %s \
+# RUN:  | FileCheck %s -check-prefixes=RV32I
+--- |
+  ; Function Attrs: uwtable
+  define void @no_reserved_call_frame(i64 %n) #0 {
+  entry:
+    %v = alloca i32, i64 %n, align 4
+    call void @callee_stack_args(ptr %v, [518 x i64] poison)
+    ret void
+  }
+
+  declare void @callee_stack_args(ptr, [518 x i64]) #1
+
+  attributes #0 = { uwtable "frame-pointer"="none" "probe-stack"="inline-asm" "target-features"="+m" }
+  attributes #1 = { "target-features"="+m" }
+...
+---
+name:            no_reserved_call_frame
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+callsEHReturn:   false
+callsUnwindInit: false
+hasEHContTarget: false
+hasEHScopes:     false
+hasEHFunclets:   false
+isOutlined:      false
+debugInstrRef:   false
+failsVerification: false
+tracksDebugUserValues: true
+registers:       []
+liveins:
+  - { reg: '$x10', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  stackProtector:  ''
+  functionContext: ''
+  maxCallFrameSize: 4294967295
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  hasTailCall:     false
+  isCalleeSavedInfoValid: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:      []
+stack:
+  - { id: 0, name: v, type: variable-sized, offset: 0, alignment: 1, stack-id: default,
+      callee-saved-register: '', callee-saved-restored: true, local-offset: 0,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+entry_values:    []
+callSites:       []
+debugValueSubstitutions: []
+constants:       []
+machineFunctionInfo:
+  varArgsFrameIndex: 0
+  varArgsSaveSize: 0
+body:             |
+  ; RV64I-LABEL: name: no_reserved_call_frame
+  ; RV64I: bb.0.entry:
+  ; RV64I-NEXT:   successors: %bb.1(0x80000000)
+  ; RV64I-NEXT:   liveins: $x10, $x1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = frame-setup ADDI $x2, -16
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; RV64I-NEXT:   frame-setup SD killed $x1, $x2, 8 :: (store (s64) into %stack.1)
+  ; RV64I-NEXT:   frame-setup SD killed $x8, $x2, 0 :: (store (s64) into %stack.2)
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION offset $x1, -8
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION offset $x8, -16
+  ; RV64I-NEXT:   $x8 = frame-setup ADDI $x2, 16
+  ; RV64I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $x8, 0
+  ; RV64I-NEXT:   renamable $x10 = SLLI killed renamable $x10, 2
+  ; RV64I-NEXT:   renamable $x10 = nuw ADDI killed renamable $x10, 15
+  ; RV64I-NEXT:   renamable $x10 = ANDI killed renamable $x10, -16
+  ; RV64I-NEXT:   renamable $x10 = SUB $x2, killed renamable $x10
+  ; RV64I-NEXT:   renamable $x11 = LUI 1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT: bb.1.entry:
+  ; RV64I-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; RV64I-NEXT:   liveins: $x10, $x11
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = SUB $x2, renamable $x11
+  ; RV64I-NEXT:   SD $x0, $x2, 0
+  ; RV64I-NEXT:   BLT renamable $x10, $x2, %bb.1
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT: bb.2.entry:
+  ; RV64I-NEXT:   liveins: $x10
+  ; RV64I-NEXT: {{  $}}
+  ; RV64I-NEXT:   $x2 = ADDI renamable $x10, 0
+  ; RV64I-NEXT:   $x11 = LUI 1
+  ; RV64I-NEXT:   $x2 = SUB $x2, killed $x11
+  ; RV64I-NEXT:   PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+  ; RV64I-NEXT:   $x10 = LUI 1
+  ; RV64I-NEXT:   $x2 = ADD $x2, killed $x10
+  ; RV64I-NEXT:   $x2 = frame-destroy ADDI $x8, -16
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa $x2, 16
+  ; RV64I-NEXT:   $x1 = frame-destroy LD $x2, 8 :: (load (s64) from %stack.1)
+  ; RV64I-NEXT:   $x8 = frame-destroy LD $x2, 0 :: (load (s64) from %stack.2)
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x1
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x8
+  ; RV64I-NEXT:   $x2 = frame-destroy ADDI $x2, 16
+  ; RV64I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; RV64I-NEXT:   PseudoRET
+  ;
+  ; RV32I-LABEL: name: no_reserved_call_frame
+  ; RV32I: bb.0.entry:
+  ; RV32I-NEXT:   successors: %bb.1(0x80000000)
+  ; RV32I-NEXT:   liveins: $x10, $x1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = frame-setup ADDI $x2, -16
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa_offset 16
+  ; RV32I-NEXT:   frame-setup SW killed $x1, $x2, 12 :: (store (s32) into %stack.1)
+  ; RV32I-NEXT:   frame-setup SW killed $x8, $x2, 8 :: (store (s32) into %stack.2)
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION offset $x1, -4
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION offset $x8, -8
+  ; RV32I-NEXT:   $x8 = frame-setup ADDI $x2, 16
+  ; RV32I-NEXT:   frame-setup CFI_INSTRUCTION def_cfa $x8, 0
+  ; RV32I-NEXT:   renamable $x10 = SLLI killed renamable $x10, 2
+  ; RV32I-NEXT:   renamable $x10 = nuw ADDI killed renamable $x10, 15
+  ; RV32I-NEXT:   renamable $x10 = ANDI killed renamable $x10, -16
+  ; RV32I-NEXT:   renamable $x10 = SUB $x2, killed renamable $x10
+  ; RV32I-NEXT:   renamable $x11 = LUI 1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT: bb.1.entry:
+  ; RV32I-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; RV32I-NEXT:   liveins: $x10, $x11
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = SUB $x2, renamable $x11
+  ; RV32I-NEXT:   SD $x0, $x2, 0
+  ; RV32I-NEXT:   BLT renamable $x10, $x2, %bb.1
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT: bb.2.entry:
+  ; RV32I-NEXT:   liveins: $x10
+  ; RV32I-NEXT: {{  $}}
+  ; RV32I-NEXT:   $x2 = ADDI renamable $x10, 0
+  ; RV32I-NEXT:   $x11 = LUI 1
+  ; RV32I-NEXT:   $x2 = SUB $x2, killed $x11
+  ; RV32I-NEXT:   PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+  ; RV32I-NEXT:   $x10 = LUI 1
+  ; RV32I-NEXT:   $x2 = ADD $x2, killed $x10
+  ; RV32I-NEXT:   $x2 = frame-destroy ADDI $x8, -16
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa $x2, 16
+  ; RV32I-NEXT:   $x1 = frame-destroy LW $x2, 12 :: (load (s32) from %stack.1)
+  ; RV32I-NEXT:   $x8 = frame-destroy LW $x2, 8 :: (load (s32) from %stack.2)
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x1
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION restore $x8
+  ; RV32I-NEXT:   $x2 = frame-destroy ADDI $x2, 16
+  ; RV32I-NEXT:   frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+  ; RV32I-NEXT:   PseudoRET
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $x10
+
+    renamable $x10 = SLLI killed renamable $x10, 2
+    renamable $x10 = nuw ADDI killed renamable $x10, 15
+    renamable $x10 = ANDI killed renamable $x10, -16
+    renamable $x10 = SUB $x2, killed renamable $x10
+    renamable $x11 = LUI 1
+
+  bb.1.entry:
+    successors: %bb.2(0x40000000), %bb.1(0x40000000)
+    liveins: $x10, $x11
+
+    $x2 = SUB $x2, renamable $x11
+    SD $x0, $x2, 0
+    BLT renamable $x10, $x2, %bb.1
+
+  bb.2.entry:
+    liveins: $x10
+
+    $x2 = ADDI renamable $x10, 0
+    ADJCALLSTACKDOWN 4088, 0, implicit-def dead $x2, implicit $x2
+    PseudoCALL target-flags(riscv-call) @callee_stack_args, csr_ilp32_lp64, implicit-def dead $x1, implicit $x10, implicit undef $x11, implicit undef $x12, implicit undef $x13, implicit undef $x14, implicit undef $x15, implicit undef $x16, implicit undef $x17, implicit-def $x2
+    ADJCALLSTACKUP 4088, 0, implicit-def dead $x2, implicit $x2
+    PseudoRET
+...
diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
index 32753ca382fc..cd7f30d8f589 100644
--- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll
@@ -716,92 +716,101 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
-; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t2, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    mv t0, sp
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srli t3, a0, 3
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    andi a5, a0, 31
-; RV32I-NEXT:    andi t3, t3, 12
-; RV32I-NEXT:    xori a5, a5, 31
-; RV32I-NEXT:    or a3, t1, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    add t0, t0, t3
-; RV32I-NEXT:    sw a4, 0(sp)
-; RV32I-NEXT:    sw a3, 4(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw a1, 12(sp)
-; RV32I-NEXT:    lw a1, 4(t0)
-; RV32I-NEXT:    lw a3, 8(t0)
-; RV32I-NEXT:    lw a4, 0(t0)
-; RV32I-NEXT:    lw a6, 12(t0)
-; RV32I-NEXT:    srl a7, a1, a0
-; RV32I-NEXT:    slli t0, a3, 1
-; RV32I-NEXT:    srl a4, a4, a0
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli t1, a6, 1
-; RV32I-NEXT:    srl a0, a6, a0
-; RV32I-NEXT:    sll a6, t0, a5
-; RV32I-NEXT:    sll a1, a1, a5
-; RV32I-NEXT:    sll a5, t1, a5
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    mv t2, sp
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a3, 0(sp)
+; RV32I-NEXT:    sw a4, 4(sp)
+; RV32I-NEXT:    sw a5, 8(sp)
+; RV32I-NEXT:    sw a0, 12(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    add a0, t2, a0
+; RV32I-NEXT:    lw a4, 4(a0)
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a6, 0(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    srl a7, a4, a1
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    srl a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a4, a4, a3
+; RV32I-NEXT:    sll a3, t1, a3
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, a4, a1
-; RV32I-NEXT:    or a3, a3, a5
+; RV32I-NEXT:    or a1, a7, a1
+; RV32I-NEXT:    or a4, a6, a4
+; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
 ; RV32I-NEXT:    sb t1, 15(a2)
 ; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 8
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    srli t1, a1, 8
-; RV32I-NEXT:    srli t2, a6, 16
-; RV32I-NEXT:    srli t3, a6, 24
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a6, 8
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a6, 4(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -943,93 +952,102 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    or a4, a6, a5
+; RV32I-NEXT:    lbu a5, 8(a0)
+; RV32I-NEXT:    lbu a6, 9(a0)
+; RV32I-NEXT:    lbu t3, 10(a0)
+; RV32I-NEXT:    lbu t4, 11(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a4, a4, a3
-; RV32I-NEXT:    or a5, a6, a5
-; RV32I-NEXT:    or a3, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
-; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli a6, a6, 8
+; RV32I-NEXT:    or a7, t0, a7
+; RV32I-NEXT:    or t0, t2, t1
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    lbu a6, 12(a0)
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t4, t4, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t3, t4, t3
+; RV32I-NEXT:    or a6, t1, a6
+; RV32I-NEXT:    or a0, a0, t2
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t2, 0(a1)
+; RV32I-NEXT:    lbu t4, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t2
 ; RV32I-NEXT:    sw zero, 0(sp)
 ; RV32I-NEXT:    sw zero, 4(sp)
 ; RV32I-NEXT:    sw zero, 8(sp)
 ; RV32I-NEXT:    sw zero, 12(sp)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    addi t0, sp, 16
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srli t3, a0, 3
-; RV32I-NEXT:    or a4, a5, a4
-; RV32I-NEXT:    andi a5, a0, 31
-; RV32I-NEXT:    andi t3, t3, 12
-; RV32I-NEXT:    or a3, t1, a3
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    sub a7, t0, t3
-; RV32I-NEXT:    sw a4, 16(sp)
-; RV32I-NEXT:    sw a3, 20(sp)
-; RV32I-NEXT:    sw a6, 24(sp)
-; RV32I-NEXT:    sw a1, 28(sp)
-; RV32I-NEXT:    lw a1, 0(a7)
-; RV32I-NEXT:    lw a3, 4(a7)
-; RV32I-NEXT:    lw a4, 8(a7)
-; RV32I-NEXT:    lw a6, 12(a7)
-; RV32I-NEXT:    xori a5, a5, 31
-; RV32I-NEXT:    sll a7, a3, a0
-; RV32I-NEXT:    srli t0, a1, 1
-; RV32I-NEXT:    sll a6, a6, a0
-; RV32I-NEXT:    srli t1, a4, 1
-; RV32I-NEXT:    sll a4, a4, a0
-; RV32I-NEXT:    srli a3, a3, 1
-; RV32I-NEXT:    sll a0, a1, a0
-; RV32I-NEXT:    srl a1, t0, a5
-; RV32I-NEXT:    srl t0, t1, a5
-; RV32I-NEXT:    srl a3, a3, a5
-; RV32I-NEXT:    srli a5, a0, 16
-; RV32I-NEXT:    srli t1, a0, 24
-; RV32I-NEXT:    srli t2, a0, 8
-; RV32I-NEXT:    or a1, a7, a1
-; RV32I-NEXT:    or a6, a6, t0
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t4
+; RV32I-NEXT:    addi t2, sp, 16
 ; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    sb a0, 0(a2)
+; RV32I-NEXT:    or a4, t0, a7
+; RV32I-NEXT:    or a5, t3, a5
+; RV32I-NEXT:    or a0, a0, a6
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a3, 16(sp)
+; RV32I-NEXT:    sw a4, 20(sp)
+; RV32I-NEXT:    sw a5, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    sub a0, t2, a0
+; RV32I-NEXT:    lw a4, 0(a0)
+; RV32I-NEXT:    lw a5, 4(a0)
+; RV32I-NEXT:    lw a6, 8(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    sll a7, a5, a1
+; RV32I-NEXT:    srli t0, a4, 1
+; RV32I-NEXT:    sll a0, a0, a1
+; RV32I-NEXT:    srli t1, a6, 1
+; RV32I-NEXT:    sll a6, a6, a1
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll a1, a4, a1
+; RV32I-NEXT:    srl a4, t0, a3
+; RV32I-NEXT:    srl t0, t1, a3
+; RV32I-NEXT:    srl a3, a5, a3
+; RV32I-NEXT:    srli a5, a1, 16
+; RV32I-NEXT:    srli t1, a1, 24
+; RV32I-NEXT:    srli t2, a1, 8
+; RV32I-NEXT:    or a4, a7, a4
+; RV32I-NEXT:    or a0, a0, t0
+; RV32I-NEXT:    or a3, a6, a3
+; RV32I-NEXT:    sb a1, 0(a2)
 ; RV32I-NEXT:    sb t2, 1(a2)
 ; RV32I-NEXT:    sb a5, 2(a2)
 ; RV32I-NEXT:    sb t1, 3(a2)
-; RV32I-NEXT:    srli a0, a3, 16
-; RV32I-NEXT:    srli a4, a3, 24
-; RV32I-NEXT:    srli a5, a3, 8
-; RV32I-NEXT:    srli a7, a6, 16
-; RV32I-NEXT:    srli t0, a6, 24
-; RV32I-NEXT:    srli t1, a6, 8
-; RV32I-NEXT:    srli t2, a1, 16
-; RV32I-NEXT:    srli t3, a1, 24
+; RV32I-NEXT:    srli a1, a3, 16
+; RV32I-NEXT:    srli a5, a3, 24
+; RV32I-NEXT:    srli a6, a3, 8
+; RV32I-NEXT:    srli a7, a0, 16
+; RV32I-NEXT:    srli t0, a0, 24
+; RV32I-NEXT:    srli t1, a0, 8
+; RV32I-NEXT:    srli t2, a4, 16
+; RV32I-NEXT:    srli t3, a4, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
-; RV32I-NEXT:    sb a5, 9(a2)
-; RV32I-NEXT:    sb a0, 10(a2)
-; RV32I-NEXT:    sb a4, 11(a2)
-; RV32I-NEXT:    srli a0, a1, 8
-; RV32I-NEXT:    sb a6, 12(a2)
+; RV32I-NEXT:    sb a6, 9(a2)
+; RV32I-NEXT:    sb a1, 10(a2)
+; RV32I-NEXT:    sb a5, 11(a2)
+; RV32I-NEXT:    srli a1, a4, 8
+; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t1, 13(a2)
 ; RV32I-NEXT:    sb a7, 14(a2)
 ; RV32I-NEXT:    sb t0, 15(a2)
-; RV32I-NEXT:    sb a1, 4(a2)
-; RV32I-NEXT:    sb a0, 5(a2)
+; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb a1, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
 ; RV32I-NEXT:    addi sp, sp, 32
@@ -1168,73 +1186,82 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu t1, 6(a0)
 ; RV32I-NEXT:    lbu t2, 7(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
+; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    lbu a4, 8(a0)
+; RV32I-NEXT:    lbu t3, 9(a0)
+; RV32I-NEXT:    lbu t4, 10(a0)
+; RV32I-NEXT:    lbu t5, 11(a0)
 ; RV32I-NEXT:    slli a5, a5, 16
 ; RV32I-NEXT:    slli a6, a6, 24
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    lbu a6, 8(a0)
-; RV32I-NEXT:    lbu a7, 9(a0)
-; RV32I-NEXT:    lbu t0, 10(a0)
-; RV32I-NEXT:    lbu t3, 11(a0)
 ; RV32I-NEXT:    slli t1, t1, 16
 ; RV32I-NEXT:    slli t2, t2, 24
-; RV32I-NEXT:    slli a7, a7, 8
-; RV32I-NEXT:    slli t0, t0, 16
-; RV32I-NEXT:    slli t3, t3, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a7, t3, t0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
 ; RV32I-NEXT:    lbu t0, 12(a0)
-; RV32I-NEXT:    lbu t2, 13(a0)
-; RV32I-NEXT:    lbu t3, 14(a0)
-; RV32I-NEXT:    lbu t4, 15(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    or a1, t2, t0
-; RV32I-NEXT:    mv t0, sp
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    srli a4, a0, 3
-; RV32I-NEXT:    or a5, t1, a5
-; RV32I-NEXT:    andi t1, a0, 31
-; RV32I-NEXT:    or t2, t4, t3
-; RV32I-NEXT:    srai t3, t4, 31
-; RV32I-NEXT:    andi a4, a4, 12
-; RV32I-NEXT:    xori t1, t1, 31
+; RV32I-NEXT:    lbu t1, 13(a0)
+; RV32I-NEXT:    lbu t2, 14(a0)
+; RV32I-NEXT:    lbu a0, 15(a0)
+; RV32I-NEXT:    slli t3, t3, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or t3, t5, t4
+; RV32I-NEXT:    or t0, t1, t0
+; RV32I-NEXT:    lbu t1, 1(a1)
+; RV32I-NEXT:    lbu t4, 0(a1)
+; RV32I-NEXT:    lbu t5, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli t1, t1, 8
+; RV32I-NEXT:    or t1, t1, t4
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or a1, a1, t5
+; RV32I-NEXT:    or a3, a5, a3
+; RV32I-NEXT:    mv a5, sp
+; RV32I-NEXT:    slli t2, t2, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    or t2, a0, t2
+; RV32I-NEXT:    srai a0, a0, 31
 ; RV32I-NEXT:    or a6, a7, a6
-; RV32I-NEXT:    or a1, t2, a1
-; RV32I-NEXT:    sw t3, 16(sp)
-; RV32I-NEXT:    sw t3, 20(sp)
-; RV32I-NEXT:    sw t3, 24(sp)
-; RV32I-NEXT:    sw t3, 28(sp)
-; RV32I-NEXT:    add a4, t0, a4
+; RV32I-NEXT:    or a4, t3, a4
+; RV32I-NEXT:    or a7, t2, t0
+; RV32I-NEXT:    or a1, a1, t1
+; RV32I-NEXT:    sw a0, 16(sp)
+; RV32I-NEXT:    sw a0, 20(sp)
+; RV32I-NEXT:    sw a0, 24(sp)
+; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    sw a3, 0(sp)
-; RV32I-NEXT:    sw a5, 4(sp)
-; RV32I-NEXT:    sw a6, 8(sp)
-; RV32I-NEXT:    sw a1, 12(sp)
-; RV32I-NEXT:    lw a1, 4(a4)
-; RV32I-NEXT:    lw a3, 8(a4)
-; RV32I-NEXT:    lw a5, 0(a4)
-; RV32I-NEXT:    lw a4, 12(a4)
-; RV32I-NEXT:    srl a6, a1, a0
-; RV32I-NEXT:    slli a7, a3, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a1, a1, 1
-; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli t0, a4, 1
-; RV32I-NEXT:    sra a0, a4, a0
-; RV32I-NEXT:    sll a4, a7, t1
-; RV32I-NEXT:    sll a1, a1, t1
-; RV32I-NEXT:    sll a7, t0, t1
+; RV32I-NEXT:    sw a6, 4(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a7, 12(sp)
+; RV32I-NEXT:    srli a0, a1, 3
+; RV32I-NEXT:    andi a3, a1, 31
+; RV32I-NEXT:    andi a0, a0, 12
+; RV32I-NEXT:    xori a3, a3, 31
+; RV32I-NEXT:    add a0, a5, a0
+; RV32I-NEXT:    lw a4, 4(a0)
+; RV32I-NEXT:    lw a5, 8(a0)
+; RV32I-NEXT:    lw a6, 0(a0)
+; RV32I-NEXT:    lw a0, 12(a0)
+; RV32I-NEXT:    srl a7, a4, a1
+; RV32I-NEXT:    slli t0, a5, 1
+; RV32I-NEXT:    srl a6, a6, a1
+; RV32I-NEXT:    slli a4, a4, 1
+; RV32I-NEXT:    srl a5, a5, a1
+; RV32I-NEXT:    slli t1, a0, 1
+; RV32I-NEXT:    sra a0, a0, a1
+; RV32I-NEXT:    sll a1, t0, a3
+; RV32I-NEXT:    sll a4, a4, a3
+; RV32I-NEXT:    sll a3, t1, a3
 ; RV32I-NEXT:    srli t0, a0, 16
 ; RV32I-NEXT:    srli t1, a0, 24
 ; RV32I-NEXT:    srli t2, a0, 8
+; RV32I-NEXT:    or a1, a7, a1
 ; RV32I-NEXT:    or a4, a6, a4
-; RV32I-NEXT:    or a1, a5, a1
-; RV32I-NEXT:    or a3, a3, a7
+; RV32I-NEXT:    or a3, a5, a3
 ; RV32I-NEXT:    sb a0, 12(a2)
 ; RV32I-NEXT:    sb t2, 13(a2)
 ; RV32I-NEXT:    sb t0, 14(a2)
@@ -1242,21 +1269,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    srli a0, a3, 16
 ; RV32I-NEXT:    srli a5, a3, 24
 ; RV32I-NEXT:    srli a6, a3, 8
-; RV32I-NEXT:    srli a7, a1, 16
-; RV32I-NEXT:    srli t0, a1, 24
-; RV32I-NEXT:    srli t1, a1, 8
-; RV32I-NEXT:    srli t2, a4, 16
-; RV32I-NEXT:    srli t3, a4, 24
+; RV32I-NEXT:    srli a7, a4, 16
+; RV32I-NEXT:    srli t0, a4, 24
+; RV32I-NEXT:    srli t1, a4, 8
+; RV32I-NEXT:    srli t2, a1, 16
+; RV32I-NEXT:    srli t3, a1, 24
 ; RV32I-NEXT:    sb a3, 8(a2)
 ; RV32I-NEXT:    sb a6, 9(a2)
 ; RV32I-NEXT:    sb a0, 10(a2)
 ; RV32I-NEXT:    sb a5, 11(a2)
-; RV32I-NEXT:    srli a0, a4, 8
-; RV32I-NEXT:    sb a1, 0(a2)
+; RV32I-NEXT:    srli a0, a1, 8
+; RV32I-NEXT:    sb a4, 0(a2)
 ; RV32I-NEXT:    sb t1, 1(a2)
 ; RV32I-NEXT:    sb a7, 2(a2)
 ; RV32I-NEXT:    sb t0, 3(a2)
-; RV32I-NEXT:    sb a4, 4(a2)
+; RV32I-NEXT:    sb a1, 4(a2)
 ; RV32I-NEXT:    sb a0, 5(a2)
 ; RV32I-NEXT:    sb t2, 6(a2)
 ; RV32I-NEXT:    sb t3, 7(a2)
@@ -1272,17 +1299,19 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: lshr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1299,122 +1328,143 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 32(sp)
 ; RV64I-NEXT:    sd zero, 40(sp)
 ; RV64I-NEXT:    sd zero, 48(sp)
 ; RV64I-NEXT:    sd zero, 56(sp)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    mv s2, sp
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    xori s5, s5, 63
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    add s2, s2, s3
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    mv a6, sp
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a1, 8(s2)
-; RV64I-NEXT:    ld a3, 16(s2)
-; RV64I-NEXT:    ld a4, 0(s2)
-; RV64I-NEXT:    ld a5, 24(s2)
-; RV64I-NEXT:    srl a6, a1, a0
-; RV64I-NEXT:    slli a7, a3, 1
-; RV64I-NEXT:    srl a4, a4, a0
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    srl a3, a3, a0
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    add a0, a6, a0
+; RV64I-NEXT:    ld a4, 8(a0)
+; RV64I-NEXT:    ld a5, 16(a0)
+; RV64I-NEXT:    ld a6, 0(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a4, a1
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    srl a5, a5, a0
-; RV64I-NEXT:    sll a0, a7, s5
-; RV64I-NEXT:    sll a1, a1, s5
-; RV64I-NEXT:    sll a7, t0, s5
-; RV64I-NEXT:    srli t0, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
-; RV64I-NEXT:    srli t4, a5, 24
-; RV64I-NEXT:    srli t5, a5, 16
-; RV64I-NEXT:    srli t6, a5, 8
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    or a3, a3, a7
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    srl t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a4, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a4, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t2, 29(a2)
-; RV64I-NEXT:    sb t1, 30(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a5, 24(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -1463,17 +1513,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: lshr_32bytes:
@@ -1498,55 +1550,67 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s7, 12(a0)
-; RV32I-NEXT:    lbu s8, 13(a0)
-; RV32I-NEXT:    lbu s9, 14(a0)
-; RV32I-NEXT:    lbu s10, 15(a0)
-; RV32I-NEXT:    lbu s11, 16(a0)
-; RV32I-NEXT:    lbu ra, 17(a0)
-; RV32I-NEXT:    lbu t4, 18(a0)
-; RV32I-NEXT:    lbu s0, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t6, t3
-; RV32I-NEXT:    or a7, s3, s2
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s5, 25(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    slli ra, ra, 8
-; RV32I-NEXT:    or s7, s8, s7
-; RV32I-NEXT:    or s2, s10, s9
-; RV32I-NEXT:    or s3, ra, s11
-; RV32I-NEXT:    lbu s4, 28(a0)
-; RV32I-NEXT:    lbu s8, 29(a0)
-; RV32I-NEXT:    lbu s9, 30(a0)
-; RV32I-NEXT:    lbu s10, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    sw zero, 56(sp)
 ; RV32I-NEXT:    sw zero, 60(sp)
 ; RV32I-NEXT:    sw zero, 64(sp)
@@ -1555,90 +1619,89 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 44(sp)
 ; RV32I-NEXT:    sw zero, 48(sp)
 ; RV32I-NEXT:    sw zero, 52(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 8
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t4, s0, t4
-; RV32I-NEXT:    addi s0, sp, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw t2, 32(sp)
+; RV32I-NEXT:    sw t3, 36(sp)
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s5, t3
-; RV32I-NEXT:    or t6, t6, s6
-; RV32I-NEXT:    or s1, s8, s4
-; RV32I-NEXT:    or s4, s10, s9
-; RV32I-NEXT:    andi s5, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, s2, s7
-; RV32I-NEXT:    or a7, t4, s3
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, t6, t3
-; RV32I-NEXT:    or t2, s4, s1
-; RV32I-NEXT:    add s0, s0, s5
-; RV32I-NEXT:    sw a7, 24(sp)
-; RV32I-NEXT:    sw t0, 28(sp)
-; RV32I-NEXT:    sw t1, 32(sp)
-; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a3, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
-; RV32I-NEXT:    sw a5, 16(sp)
-; RV32I-NEXT:    sw a6, 20(sp)
-; RV32I-NEXT:    lw a3, 0(s0)
-; RV32I-NEXT:    lw a4, 4(s0)
-; RV32I-NEXT:    lw a5, 8(s0)
-; RV32I-NEXT:    lw a6, 12(s0)
-; RV32I-NEXT:    lw a7, 16(s0)
-; RV32I-NEXT:    lw t0, 20(s0)
-; RV32I-NEXT:    lw t1, 24(s0)
-; RV32I-NEXT:    lw t2, 28(s0)
-; RV32I-NEXT:    srl t3, a4, a0
-; RV32I-NEXT:    slli t4, a5, 1
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s3, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl t5, a6, a0
-; RV32I-NEXT:    slli t6, a7, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    srl s0, t0, a0
-; RV32I-NEXT:    slli s1, t1, 1
-; RV32I-NEXT:    srl a7, a7, a0
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    slli s2, t2, 1
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
 ; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    srl s3, a4, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll t4, t6, a1
-; RV32I-NEXT:    sll a6, a6, a1
-; RV32I-NEXT:    sll t6, s1, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll s1, s2, a1
-; RV32I-NEXT:    srli s2, t2, 24
-; RV32I-NEXT:    srli s3, t2, 16
-; RV32I-NEXT:    srli s4, t2, 8
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, t4
-; RV32I-NEXT:    or a4, a5, a6
-; RV32I-NEXT:    or a5, s0, t6
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, s1
-; RV32I-NEXT:    sb t2, 28(a2)
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s3, 30(a2)
-; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
@@ -1712,17 +1775,19 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: shl_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -1739,125 +1804,146 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli s8, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a5, a4, a3
+; RV64I-NEXT:    or a6, a6, s8
+; RV64I-NEXT:    or a3, t0, a7
+; RV64I-NEXT:    or a4, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
+; RV64I-NEXT:    slli s5, s5, 16
+; RV64I-NEXT:    slli s6, s6, 24
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
 ; RV64I-NEXT:    sd zero, 0(sp)
 ; RV64I-NEXT:    sd zero, 8(sp)
 ; RV64I-NEXT:    sd zero, 16(sp)
 ; RV64I-NEXT:    sd zero, 24(sp)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    addi s2, sp, 32
-; RV64I-NEXT:    slli s4, s4, 8
-; RV64I-NEXT:    slli s5, s5, 16
-; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    sub t2, s2, s3
-; RV64I-NEXT:    slli a4, a4, 32
-; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t1, t1, 32
+; RV64I-NEXT:    or a5, a6, a5
+; RV64I-NEXT:    addi a6, sp, 32
 ; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t1, a1
+; RV64I-NEXT:    or a4, t0, a7
+; RV64I-NEXT:    or a7, t2, t1
+; RV64I-NEXT:    or t0, t4, t3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t1, s0, t6
+; RV64I-NEXT:    or t2, s5, s1
+; RV64I-NEXT:    or t3, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
+; RV64I-NEXT:    slli a3, a3, 32
+; RV64I-NEXT:    slli a7, a7, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t2, t2, 32
+; RV64I-NEXT:    slli a1, a1, 32
+; RV64I-NEXT:    or a3, a3, a5
+; RV64I-NEXT:    or a4, a7, a4
+; RV64I-NEXT:    or a0, a0, t0
+; RV64I-NEXT:    or a5, t2, t1
+; RV64I-NEXT:    or a1, a1, t3
 ; RV64I-NEXT:    sd a3, 32(sp)
 ; RV64I-NEXT:    sd a4, 40(sp)
-; RV64I-NEXT:    sd a5, 48(sp)
-; RV64I-NEXT:    sd a1, 56(sp)
-; RV64I-NEXT:    ld a1, 0(t2)
-; RV64I-NEXT:    ld a3, 8(t2)
-; RV64I-NEXT:    ld a4, 16(t2)
-; RV64I-NEXT:    ld a5, 24(t2)
-; RV64I-NEXT:    xori a6, s5, 63
-; RV64I-NEXT:    sll a7, a3, a0
-; RV64I-NEXT:    srli t0, a1, 1
-; RV64I-NEXT:    sll a5, a5, a0
-; RV64I-NEXT:    srli t1, a4, 1
-; RV64I-NEXT:    sll a4, a4, a0
-; RV64I-NEXT:    srli a3, a3, 1
-; RV64I-NEXT:    sll t2, a1, a0
-; RV64I-NEXT:    srl a0, t0, a6
-; RV64I-NEXT:    srl a1, t1, a6
-; RV64I-NEXT:    srl a3, a3, a6
-; RV64I-NEXT:    srli a6, t2, 56
-; RV64I-NEXT:    srli t0, t2, 48
-; RV64I-NEXT:    srli t1, t2, 40
-; RV64I-NEXT:    srli t3, t2, 32
-; RV64I-NEXT:    srli t4, t2, 24
-; RV64I-NEXT:    srli t5, t2, 16
-; RV64I-NEXT:    srli t6, t2, 8
-; RV64I-NEXT:    or a0, a7, a0
-; RV64I-NEXT:    or a1, a5, a1
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    sb t3, 4(a2)
-; RV64I-NEXT:    sb t1, 5(a2)
-; RV64I-NEXT:    sb t0, 6(a2)
-; RV64I-NEXT:    sb a6, 7(a2)
-; RV64I-NEXT:    sb t2, 0(a2)
-; RV64I-NEXT:    sb t6, 1(a2)
-; RV64I-NEXT:    sb t5, 2(a2)
-; RV64I-NEXT:    sb t4, 3(a2)
+; RV64I-NEXT:    sd a0, 48(sp)
+; RV64I-NEXT:    sd a5, 56(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    sub a0, a6, a0
+; RV64I-NEXT:    ld a4, 0(a0)
+; RV64I-NEXT:    ld a5, 8(a0)
+; RV64I-NEXT:    ld a6, 16(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    sll a7, a5, a1
+; RV64I-NEXT:    srli t0, a4, 1
+; RV64I-NEXT:    sll t1, a0, a1
+; RV64I-NEXT:    srli a0, a6, 1
+; RV64I-NEXT:    sll a6, a6, a1
+; RV64I-NEXT:    srli a5, a5, 1
+; RV64I-NEXT:    sll a4, a4, a1
+; RV64I-NEXT:    srl a1, t0, a3
+; RV64I-NEXT:    srl t0, a0, a3
+; RV64I-NEXT:    srl a3, a5, a3
+; RV64I-NEXT:    srli a5, a4, 56
+; RV64I-NEXT:    srli t2, a4, 48
+; RV64I-NEXT:    srli t3, a4, 40
+; RV64I-NEXT:    srli t4, a4, 32
+; RV64I-NEXT:    srli t5, a4, 24
+; RV64I-NEXT:    srli t6, a4, 16
+; RV64I-NEXT:    srli s0, a4, 8
+; RV64I-NEXT:    or a0, a7, a1
+; RV64I-NEXT:    or a1, t1, t0
+; RV64I-NEXT:    or a3, a6, a3
+; RV64I-NEXT:    sb t4, 4(a2)
+; RV64I-NEXT:    sb t3, 5(a2)
+; RV64I-NEXT:    sb t2, 6(a2)
+; RV64I-NEXT:    sb a5, 7(a2)
+; RV64I-NEXT:    sb a4, 0(a2)
+; RV64I-NEXT:    sb s0, 1(a2)
+; RV64I-NEXT:    sb t6, 2(a2)
+; RV64I-NEXT:    sb t5, 3(a2)
 ; RV64I-NEXT:    srli a4, a3, 56
 ; RV64I-NEXT:    srli a5, a3, 48
 ; RV64I-NEXT:    srli a6, a3, 40
@@ -1903,17 +1989,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    sb a1, 9(a2)
 ; RV64I-NEXT:    sb a5, 10(a2)
 ; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: shl_32bytes:
@@ -1938,55 +2026,67 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t6, 7(a0)
-; RV32I-NEXT:    lbu s2, 8(a0)
-; RV32I-NEXT:    lbu s3, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s7, 12(a0)
-; RV32I-NEXT:    lbu s8, 13(a0)
-; RV32I-NEXT:    lbu s9, 14(a0)
-; RV32I-NEXT:    lbu s10, 15(a0)
-; RV32I-NEXT:    lbu s11, 16(a0)
-; RV32I-NEXT:    lbu ra, 17(a0)
-; RV32I-NEXT:    lbu t4, 18(a0)
-; RV32I-NEXT:    lbu s0, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s2, 13(a0)
+; RV32I-NEXT:    lbu s4, 14(a0)
+; RV32I-NEXT:    lbu s5, 15(a0)
+; RV32I-NEXT:    lbu s6, 16(a0)
+; RV32I-NEXT:    lbu s7, 17(a0)
+; RV32I-NEXT:    lbu s8, 18(a0)
+; RV32I-NEXT:    lbu s9, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s10, 20(a0)
+; RV32I-NEXT:    lbu s11, 21(a0)
+; RV32I-NEXT:    lbu ra, 22(a0)
+; RV32I-NEXT:    lbu a3, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
 ; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or a5, t0, a5
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu s1, 24(a0)
+; RV32I-NEXT:    lbu s3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s2, s2, 8
 ; RV32I-NEXT:    slli s4, s4, 16
 ; RV32I-NEXT:    slli s5, s5, 24
-; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t6, t3
-; RV32I-NEXT:    or a7, s3, s2
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s5, 25(a0)
-; RV32I-NEXT:    lbu s6, 26(a0)
-; RV32I-NEXT:    lbu t6, 27(a0)
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    slli ra, ra, 8
-; RV32I-NEXT:    or s7, s8, s7
-; RV32I-NEXT:    or s2, s10, s9
-; RV32I-NEXT:    or s3, ra, s11
-; RV32I-NEXT:    lbu s4, 28(a0)
-; RV32I-NEXT:    lbu s8, 29(a0)
-; RV32I-NEXT:    lbu s9, 30(a0)
-; RV32I-NEXT:    lbu s10, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, s5, s4
+; RV32I-NEXT:    or t3, s7, s6
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s4, 29(a0)
+; RV32I-NEXT:    lbu s5, 30(a0)
+; RV32I-NEXT:    lbu s6, 31(a0)
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli s9, s9, 24
+; RV32I-NEXT:    slli s11, s11, 8
+; RV32I-NEXT:    slli ra, ra, 16
+; RV32I-NEXT:    slli a3, a3, 24
+; RV32I-NEXT:    or a0, s9, s8
+; RV32I-NEXT:    or s0, s11, s10
+; RV32I-NEXT:    or s2, a3, ra
+; RV32I-NEXT:    lbu a3, 0(a1)
+; RV32I-NEXT:    lbu s7, 1(a1)
+; RV32I-NEXT:    lbu s8, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
 ; RV32I-NEXT:    sw zero, 24(sp)
 ; RV32I-NEXT:    sw zero, 28(sp)
 ; RV32I-NEXT:    sw zero, 32(sp)
@@ -1995,89 +2095,88 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    sw zero, 12(sp)
 ; RV32I-NEXT:    sw zero, 16(sp)
 ; RV32I-NEXT:    sw zero, 20(sp)
+; RV32I-NEXT:    slli s3, s3, 8
+; RV32I-NEXT:    or s1, s3, s1
+; RV32I-NEXT:    addi s3, sp, 40
 ; RV32I-NEXT:    slli t4, t4, 16
-; RV32I-NEXT:    slli s0, s0, 24
-; RV32I-NEXT:    or t4, s0, t4
-; RV32I-NEXT:    addi s0, sp, 40
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s5, s5, 8
-; RV32I-NEXT:    slli s6, s6, 16
-; RV32I-NEXT:    slli t6, t6, 24
-; RV32I-NEXT:    slli s8, s8, 8
-; RV32I-NEXT:    slli s9, s9, 16
-; RV32I-NEXT:    slli s10, s10, 24
-; RV32I-NEXT:    or t1, t2, t1
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s4, s4, 8
+; RV32I-NEXT:    slli s5, s5, 16
+; RV32I-NEXT:    slli s6, s6, 24
+; RV32I-NEXT:    slli s7, s7, 8
+; RV32I-NEXT:    slli s8, s8, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s4, t6
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a3, s7, a3
+; RV32I-NEXT:    or a1, a1, s8
+; RV32I-NEXT:    lw s4, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, s4
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, a0, t3
+; RV32I-NEXT:    or t1, s2, s0
+; RV32I-NEXT:    or t2, t4, s1
+; RV32I-NEXT:    or t3, t6, t5
+; RV32I-NEXT:    or a0, a1, a3
+; RV32I-NEXT:    sw t0, 56(sp)
+; RV32I-NEXT:    sw t1, 60(sp)
+; RV32I-NEXT:    sw t2, 64(sp)
+; RV32I-NEXT:    sw t3, 68(sp)
+; RV32I-NEXT:    sw a4, 40(sp)
+; RV32I-NEXT:    sw a5, 44(sp)
+; RV32I-NEXT:    sw a6, 48(sp)
+; RV32I-NEXT:    sw a7, 52(sp)
 ; RV32I-NEXT:    srli a1, a0, 3
-; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s5, t3
-; RV32I-NEXT:    or t6, t6, s6
-; RV32I-NEXT:    or s1, s8, s4
-; RV32I-NEXT:    or s4, s10, s9
-; RV32I-NEXT:    andi s5, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, s2, s7
-; RV32I-NEXT:    or a7, t4, s3
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, t6, t3
-; RV32I-NEXT:    or t2, s4, s1
-; RV32I-NEXT:    sub t3, s0, s5
-; RV32I-NEXT:    sw a7, 56(sp)
-; RV32I-NEXT:    sw t0, 60(sp)
-; RV32I-NEXT:    sw t1, 64(sp)
-; RV32I-NEXT:    sw t2, 68(sp)
-; RV32I-NEXT:    sw a3, 40(sp)
-; RV32I-NEXT:    sw a4, 44(sp)
-; RV32I-NEXT:    sw a5, 48(sp)
-; RV32I-NEXT:    sw a6, 52(sp)
-; RV32I-NEXT:    lw a3, 0(t3)
-; RV32I-NEXT:    lw a4, 4(t3)
-; RV32I-NEXT:    lw a5, 8(t3)
-; RV32I-NEXT:    lw a6, 12(t3)
-; RV32I-NEXT:    lw a7, 16(t3)
-; RV32I-NEXT:    lw t0, 20(t3)
-; RV32I-NEXT:    lw t1, 24(t3)
-; RV32I-NEXT:    lw t2, 28(t3)
-; RV32I-NEXT:    sll t3, a4, a0
-; RV32I-NEXT:    srli t4, a3, 1
-; RV32I-NEXT:    sll t5, a6, a0
-; RV32I-NEXT:    srli t6, a5, 1
-; RV32I-NEXT:    sll a5, a5, a0
-; RV32I-NEXT:    srli a4, a4, 1
-; RV32I-NEXT:    sll s0, t0, a0
-; RV32I-NEXT:    srli s1, a7, 1
-; RV32I-NEXT:    sll a7, a7, a0
-; RV32I-NEXT:    srli a6, a6, 1
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    sub a3, s3, a4
+; RV32I-NEXT:    lw a4, 0(a3)
+; RV32I-NEXT:    lw a5, 4(a3)
+; RV32I-NEXT:    lw a6, 8(a3)
+; RV32I-NEXT:    lw a7, 12(a3)
+; RV32I-NEXT:    lw t0, 16(a3)
+; RV32I-NEXT:    lw t1, 20(a3)
+; RV32I-NEXT:    lw t2, 24(a3)
+; RV32I-NEXT:    lw a3, 28(a3)
+; RV32I-NEXT:    sll t3, a5, a0
+; RV32I-NEXT:    srli t4, a4, 1
+; RV32I-NEXT:    sll t5, a7, a0
+; RV32I-NEXT:    srli t6, a6, 1
+; RV32I-NEXT:    sll a6, a6, a0
+; RV32I-NEXT:    srli a5, a5, 1
+; RV32I-NEXT:    sll s0, t1, a0
+; RV32I-NEXT:    srli s1, t0, 1
+; RV32I-NEXT:    sll t0, t0, a0
+; RV32I-NEXT:    srli a7, a7, 1
+; RV32I-NEXT:    sll s2, a3, a0
+; RV32I-NEXT:    srli a3, t2, 1
 ; RV32I-NEXT:    sll t2, t2, a0
-; RV32I-NEXT:    srli s2, t1, 1
-; RV32I-NEXT:    sll t1, t1, a0
-; RV32I-NEXT:    srli t0, t0, 1
-; RV32I-NEXT:    sll s3, a3, a0
+; RV32I-NEXT:    srli t1, t1, 1
+; RV32I-NEXT:    sll s3, a4, a0
 ; RV32I-NEXT:    srl a0, t4, a1
-; RV32I-NEXT:    srl a3, t6, a1
-; RV32I-NEXT:    srl a4, a4, a1
+; RV32I-NEXT:    srl a4, t6, a1
+; RV32I-NEXT:    srl a5, a5, a1
 ; RV32I-NEXT:    srl t4, s1, a1
-; RV32I-NEXT:    srl a6, a6, a1
-; RV32I-NEXT:    srl t6, s2, a1
-; RV32I-NEXT:    srl t0, t0, a1
+; RV32I-NEXT:    srl a7, a7, a1
+; RV32I-NEXT:    srl t6, a3, a1
+; RV32I-NEXT:    srl t1, t1, a1
 ; RV32I-NEXT:    srli s1, s3, 24
-; RV32I-NEXT:    srli s2, s3, 16
-; RV32I-NEXT:    srli s4, s3, 8
+; RV32I-NEXT:    srli s4, s3, 16
+; RV32I-NEXT:    srli s5, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
-; RV32I-NEXT:    or a1, t5, a3
-; RV32I-NEXT:    or a3, a5, a4
+; RV32I-NEXT:    or a1, t5, a4
+; RV32I-NEXT:    or a3, a6, a5
 ; RV32I-NEXT:    or a4, s0, t4
-; RV32I-NEXT:    or a5, a7, a6
-; RV32I-NEXT:    or a6, t2, t6
-; RV32I-NEXT:    or a7, t1, t0
+; RV32I-NEXT:    or a5, t0, a7
+; RV32I-NEXT:    or a6, s2, t6
+; RV32I-NEXT:    or a7, t2, t1
 ; RV32I-NEXT:    sb s3, 0(a2)
-; RV32I-NEXT:    sb s4, 1(a2)
-; RV32I-NEXT:    sb s2, 2(a2)
+; RV32I-NEXT:    sb s5, 1(a2)
+; RV32I-NEXT:    sb s4, 2(a2)
 ; RV32I-NEXT:    sb s1, 3(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
@@ -2152,17 +2251,19 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-LABEL: ashr_32bytes:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -144
-; RV64I-NEXT:    sd s0, 136(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s1, 128(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s2, 120(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s3, 112(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s4, 104(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s5, 96(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s6, 88(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s7, 80(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s8, 72(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    sd s9, 64(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    addi sp, sp, -160
+; RV64I-NEXT:    sd s0, 152(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s1, 144(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s2, 136(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s3, 128(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s4, 120(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s5, 112(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s6, 104(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s7, 96(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s8, 88(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s9, 80(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s10, 72(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    sd s11, 64(sp) # 8-byte Folded Spill
 ; RV64I-NEXT:    lbu a3, 0(a0)
 ; RV64I-NEXT:    lbu a4, 1(a0)
 ; RV64I-NEXT:    lbu a5, 2(a0)
@@ -2179,123 +2280,144 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    lbu s1, 13(a0)
 ; RV64I-NEXT:    lbu s2, 14(a0)
 ; RV64I-NEXT:    lbu s3, 15(a0)
-; RV64I-NEXT:    slli a4, a4, 8
-; RV64I-NEXT:    slli a5, a5, 16
-; RV64I-NEXT:    slli a6, a6, 24
-; RV64I-NEXT:    or a3, a4, a3
-; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    lbu s4, 16(a0)
 ; RV64I-NEXT:    lbu s5, 17(a0)
 ; RV64I-NEXT:    lbu s6, 18(a0)
 ; RV64I-NEXT:    lbu s7, 19(a0)
+; RV64I-NEXT:    slli a4, a4, 8
+; RV64I-NEXT:    slli a5, a5, 16
+; RV64I-NEXT:    slli a6, a6, 24
 ; RV64I-NEXT:    slli t0, t0, 8
 ; RV64I-NEXT:    slli t1, t1, 16
 ; RV64I-NEXT:    slli t2, t2, 24
+; RV64I-NEXT:    or a3, a4, a3
+; RV64I-NEXT:    or a4, a6, a5
+; RV64I-NEXT:    or a5, t0, a7
+; RV64I-NEXT:    or a6, t2, t1
+; RV64I-NEXT:    lbu s8, 20(a0)
+; RV64I-NEXT:    lbu s9, 21(a0)
+; RV64I-NEXT:    lbu s10, 22(a0)
+; RV64I-NEXT:    lbu s11, 23(a0)
 ; RV64I-NEXT:    slli t4, t4, 8
 ; RV64I-NEXT:    slli t5, t5, 16
 ; RV64I-NEXT:    slli t6, t6, 24
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a6, t2, t1
-; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    lbu t5, 20(a0)
-; RV64I-NEXT:    lbu t6, 21(a0)
-; RV64I-NEXT:    lbu s8, 22(a0)
-; RV64I-NEXT:    lbu s9, 23(a0)
 ; RV64I-NEXT:    slli s1, s1, 8
 ; RV64I-NEXT:    slli s2, s2, 16
 ; RV64I-NEXT:    slli s3, s3, 24
+; RV64I-NEXT:    or a7, t4, t3
+; RV64I-NEXT:    or t0, t6, t5
+; RV64I-NEXT:    or t1, s1, s0
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    lbu t6, 24(a0)
+; RV64I-NEXT:    lbu s0, 25(a0)
+; RV64I-NEXT:    lbu s1, 26(a0)
+; RV64I-NEXT:    lbu s2, 27(a0)
 ; RV64I-NEXT:    slli s5, s5, 8
 ; RV64I-NEXT:    slli s6, s6, 16
 ; RV64I-NEXT:    slli s7, s7, 24
-; RV64I-NEXT:    or t1, s1, s0
-; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    slli s9, s9, 8
 ; RV64I-NEXT:    or t3, s5, s4
 ; RV64I-NEXT:    or t4, s7, s6
-; RV64I-NEXT:    lbu s0, 24(a0)
-; RV64I-NEXT:    lbu s1, 25(a0)
-; RV64I-NEXT:    lbu s2, 26(a0)
-; RV64I-NEXT:    lbu s3, 27(a0)
-; RV64I-NEXT:    slli t6, t6, 8
-; RV64I-NEXT:    slli s8, s8, 16
-; RV64I-NEXT:    slli s9, s9, 24
-; RV64I-NEXT:    slli s1, s1, 8
-; RV64I-NEXT:    or t5, t6, t5
-; RV64I-NEXT:    or t6, s9, s8
-; RV64I-NEXT:    or s0, s1, s0
-; RV64I-NEXT:    lbu s1, 28(a0)
+; RV64I-NEXT:    or t5, s9, s8
+; RV64I-NEXT:    lbu s3, 28(a0)
 ; RV64I-NEXT:    lbu s4, 29(a0)
 ; RV64I-NEXT:    lbu s5, 30(a0)
 ; RV64I-NEXT:    lbu s6, 31(a0)
-; RV64I-NEXT:    lbu a0, 0(a1)
-; RV64I-NEXT:    slli s2, s2, 16
-; RV64I-NEXT:    slli s3, s3, 24
-; RV64I-NEXT:    or a1, s3, s2
-; RV64I-NEXT:    mv s2, sp
+; RV64I-NEXT:    slli s10, s10, 16
+; RV64I-NEXT:    slli s11, s11, 24
+; RV64I-NEXT:    slli s0, s0, 8
+; RV64I-NEXT:    slli s1, s1, 16
+; RV64I-NEXT:    slli s2, s2, 24
 ; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or a0, s11, s10
+; RV64I-NEXT:    or t6, s0, t6
+; RV64I-NEXT:    or s0, s2, s1
+; RV64I-NEXT:    or s1, s4, s3
+; RV64I-NEXT:    lbu s2, 0(a1)
+; RV64I-NEXT:    lbu s3, 1(a1)
+; RV64I-NEXT:    lbu s4, 2(a1)
+; RV64I-NEXT:    lbu s7, 3(a1)
 ; RV64I-NEXT:    slli s5, s5, 16
 ; RV64I-NEXT:    slli s6, s6, 24
-; RV64I-NEXT:    or s1, s4, s1
-; RV64I-NEXT:    srli s3, a0, 3
-; RV64I-NEXT:    or s4, s6, s5
-; RV64I-NEXT:    andi s5, a0, 63
-; RV64I-NEXT:    andi s3, s3, 24
-; RV64I-NEXT:    xori s5, s5, 63
+; RV64I-NEXT:    slli s3, s3, 8
+; RV64I-NEXT:    slli s4, s4, 16
+; RV64I-NEXT:    slli s7, s7, 24
+; RV64I-NEXT:    or s5, s6, s5
+; RV64I-NEXT:    or s2, s3, s2
+; RV64I-NEXT:    or s3, s7, s4
+; RV64I-NEXT:    lbu s4, 5(a1)
+; RV64I-NEXT:    lbu s6, 4(a1)
+; RV64I-NEXT:    lbu s7, 6(a1)
+; RV64I-NEXT:    lbu a1, 7(a1)
+; RV64I-NEXT:    slli s4, s4, 8
+; RV64I-NEXT:    or s4, s4, s6
+; RV64I-NEXT:    slli s7, s7, 16
+; RV64I-NEXT:    slli a1, a1, 24
+; RV64I-NEXT:    or a1, a1, s7
+; RV64I-NEXT:    mv s6, sp
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
 ; RV64I-NEXT:    or a5, t0, a7
 ; RV64I-NEXT:    or a6, t2, t1
 ; RV64I-NEXT:    or a7, t4, t3
-; RV64I-NEXT:    or t0, t6, t5
-; RV64I-NEXT:    or a1, a1, s0
-; RV64I-NEXT:    or t1, s4, s1
-; RV64I-NEXT:    add s2, s2, s3
+; RV64I-NEXT:    or a0, a0, t5
+; RV64I-NEXT:    or t0, s0, t6
+; RV64I-NEXT:    or t1, s5, s1
+; RV64I-NEXT:    or t2, s3, s2
+; RV64I-NEXT:    or a1, a1, s4
 ; RV64I-NEXT:    slli a4, a4, 32
 ; RV64I-NEXT:    slli a6, a6, 32
-; RV64I-NEXT:    slli t0, t0, 32
-; RV64I-NEXT:    slli t2, t1, 32
+; RV64I-NEXT:    slli a0, a0, 32
+; RV64I-NEXT:    slli t3, t1, 32
+; RV64I-NEXT:    slli a1, a1, 32
 ; RV64I-NEXT:    sraiw t1, t1, 31
 ; RV64I-NEXT:    or a3, a4, a3
 ; RV64I-NEXT:    or a4, a6, a5
-; RV64I-NEXT:    or a5, t0, a7
-; RV64I-NEXT:    or a1, t2, a1
+; RV64I-NEXT:    or a0, a0, a7
+; RV64I-NEXT:    or a5, t3, t0
+; RV64I-NEXT:    or a1, a1, t2
 ; RV64I-NEXT:    sd t1, 32(sp)
 ; RV64I-NEXT:    sd t1, 40(sp)
 ; RV64I-NEXT:    sd t1, 48(sp)
 ; RV64I-NEXT:    sd t1, 56(sp)
 ; RV64I-NEXT:    sd a3, 0(sp)
 ; RV64I-NEXT:    sd a4, 8(sp)
-; RV64I-NEXT:    sd a5, 16(sp)
-; RV64I-NEXT:    sd a1, 24(sp)
-; RV64I-NEXT:    ld a1, 8(s2)
-; RV64I-NEXT:    ld a3, 16(s2)
-; RV64I-NEXT:    ld a4, 0(s2)
-; RV64I-NEXT:    ld a5, 24(s2)
-; RV64I-NEXT:    srl a6, a1, a0
-; RV64I-NEXT:    slli a7, a3, 1
-; RV64I-NEXT:    srl a4, a4, a0
-; RV64I-NEXT:    slli a1, a1, 1
-; RV64I-NEXT:    srl a3, a3, a0
+; RV64I-NEXT:    sd a0, 16(sp)
+; RV64I-NEXT:    sd a5, 24(sp)
+; RV64I-NEXT:    srli a0, a1, 3
+; RV64I-NEXT:    andi a3, a1, 63
+; RV64I-NEXT:    andi a0, a0, 24
+; RV64I-NEXT:    xori a3, a3, 63
+; RV64I-NEXT:    add a0, s6, a0
+; RV64I-NEXT:    ld a4, 8(a0)
+; RV64I-NEXT:    ld a5, 16(a0)
+; RV64I-NEXT:    ld a6, 0(a0)
+; RV64I-NEXT:    ld a0, 24(a0)
+; RV64I-NEXT:    srl a7, a4, a1
 ; RV64I-NEXT:    slli t0, a5, 1
-; RV64I-NEXT:    sra a5, a5, a0
-; RV64I-NEXT:    sll a0, a7, s5
-; RV64I-NEXT:    sll a1, a1, s5
-; RV64I-NEXT:    sll a7, t0, s5
-; RV64I-NEXT:    srli t0, a5, 56
-; RV64I-NEXT:    srli t1, a5, 48
-; RV64I-NEXT:    srli t2, a5, 40
-; RV64I-NEXT:    srli t3, a5, 32
-; RV64I-NEXT:    srli t4, a5, 24
-; RV64I-NEXT:    srli t5, a5, 16
-; RV64I-NEXT:    srli t6, a5, 8
-; RV64I-NEXT:    or a0, a6, a0
-; RV64I-NEXT:    or a1, a4, a1
-; RV64I-NEXT:    or a3, a3, a7
+; RV64I-NEXT:    srl a6, a6, a1
+; RV64I-NEXT:    slli a4, a4, 1
+; RV64I-NEXT:    srl a5, a5, a1
+; RV64I-NEXT:    slli t1, a0, 1
+; RV64I-NEXT:    sra t2, a0, a1
+; RV64I-NEXT:    sll a0, t0, a3
+; RV64I-NEXT:    sll a1, a4, a3
+; RV64I-NEXT:    sll a3, t1, a3
+; RV64I-NEXT:    srli a4, t2, 56
+; RV64I-NEXT:    srli t0, t2, 48
+; RV64I-NEXT:    srli t1, t2, 40
+; RV64I-NEXT:    srli t3, t2, 32
+; RV64I-NEXT:    srli t4, t2, 24
+; RV64I-NEXT:    srli t5, t2, 16
+; RV64I-NEXT:    srli t6, t2, 8
+; RV64I-NEXT:    or a0, a7, a0
+; RV64I-NEXT:    or a1, a6, a1
+; RV64I-NEXT:    or a3, a5, a3
 ; RV64I-NEXT:    sb t3, 28(a2)
-; RV64I-NEXT:    sb t2, 29(a2)
-; RV64I-NEXT:    sb t1, 30(a2)
-; RV64I-NEXT:    sb t0, 31(a2)
-; RV64I-NEXT:    sb a5, 24(a2)
+; RV64I-NEXT:    sb t1, 29(a2)
+; RV64I-NEXT:    sb t0, 30(a2)
+; RV64I-NEXT:    sb a4, 31(a2)
+; RV64I-NEXT:    sb t2, 24(a2)
 ; RV64I-NEXT:    sb t6, 25(a2)
 ; RV64I-NEXT:    sb t5, 26(a2)
 ; RV64I-NEXT:    sb t4, 27(a2)
@@ -2316,45 +2438,47 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV64I-NEXT:    srli s3, a0, 56
 ; RV64I-NEXT:    srli s4, a0, 48
 ; RV64I-NEXT:    srli s5, a0, 40
+; RV64I-NEXT:    srli s6, a0, 32
 ; RV64I-NEXT:    sb a7, 20(a2)
 ; RV64I-NEXT:    sb a6, 21(a2)
 ; RV64I-NEXT:    sb a5, 22(a2)
 ; RV64I-NEXT:    sb a4, 23(a2)
-; RV64I-NEXT:    srli a4, a0, 32
+; RV64I-NEXT:    srli a4, a0, 24
 ; RV64I-NEXT:    sb a3, 16(a2)
 ; RV64I-NEXT:    sb t2, 17(a2)
 ; RV64I-NEXT:    sb t1, 18(a2)
 ; RV64I-NEXT:    sb t0, 19(a2)
-; RV64I-NEXT:    srli a3, a0, 24
+; RV64I-NEXT:    srli a3, a0, 16
 ; RV64I-NEXT:    sb t6, 4(a2)
 ; RV64I-NEXT:    sb t5, 5(a2)
 ; RV64I-NEXT:    sb t4, 6(a2)
 ; RV64I-NEXT:    sb t3, 7(a2)
-; RV64I-NEXT:    srli a5, a0, 16
+; RV64I-NEXT:    srli a5, a0, 8
 ; RV64I-NEXT:    sb a1, 0(a2)
 ; RV64I-NEXT:    sb s2, 1(a2)
 ; RV64I-NEXT:    sb s1, 2(a2)
 ; RV64I-NEXT:    sb s0, 3(a2)
-; RV64I-NEXT:    srli a1, a0, 8
-; RV64I-NEXT:    sb a4, 12(a2)
+; RV64I-NEXT:    sb s6, 12(a2)
 ; RV64I-NEXT:    sb s5, 13(a2)
 ; RV64I-NEXT:    sb s4, 14(a2)
 ; RV64I-NEXT:    sb s3, 15(a2)
 ; RV64I-NEXT:    sb a0, 8(a2)
-; RV64I-NEXT:    sb a1, 9(a2)
-; RV64I-NEXT:    sb a5, 10(a2)
-; RV64I-NEXT:    sb a3, 11(a2)
-; RV64I-NEXT:    ld s0, 136(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s1, 128(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s2, 120(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s3, 112(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s4, 104(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s5, 96(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s6, 88(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s7, 80(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s8, 72(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    ld s9, 64(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 144
+; RV64I-NEXT:    sb a5, 9(a2)
+; RV64I-NEXT:    sb a3, 10(a2)
+; RV64I-NEXT:    sb a4, 11(a2)
+; RV64I-NEXT:    ld s0, 152(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s1, 144(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s2, 136(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s3, 128(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s4, 120(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s5, 112(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s6, 104(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s7, 96(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s8, 88(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s9, 80(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s10, 72(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    ld s11, 64(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 160
 ; RV64I-NEXT:    ret
 ;
 ; RV32I-LABEL: ashr_32bytes:
@@ -2379,148 +2503,159 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; RV32I-NEXT:    lbu a7, 3(a0)
 ; RV32I-NEXT:    lbu a5, 4(a0)
 ; RV32I-NEXT:    lbu t0, 5(a0)
-; RV32I-NEXT:    lbu t3, 6(a0)
-; RV32I-NEXT:    lbu t4, 7(a0)
-; RV32I-NEXT:    lbu t6, 8(a0)
-; RV32I-NEXT:    lbu s0, 9(a0)
-; RV32I-NEXT:    lbu s4, 10(a0)
-; RV32I-NEXT:    lbu s5, 11(a0)
-; RV32I-NEXT:    lbu s6, 12(a0)
-; RV32I-NEXT:    lbu s7, 13(a0)
-; RV32I-NEXT:    lbu s8, 14(a0)
-; RV32I-NEXT:    lbu s9, 15(a0)
-; RV32I-NEXT:    lbu s10, 16(a0)
-; RV32I-NEXT:    lbu s11, 17(a0)
-; RV32I-NEXT:    lbu s2, 18(a0)
-; RV32I-NEXT:    lbu s3, 19(a0)
+; RV32I-NEXT:    lbu t1, 6(a0)
+; RV32I-NEXT:    lbu t2, 7(a0)
+; RV32I-NEXT:    lbu t3, 8(a0)
+; RV32I-NEXT:    lbu t4, 9(a0)
+; RV32I-NEXT:    lbu t5, 10(a0)
+; RV32I-NEXT:    lbu t6, 11(a0)
+; RV32I-NEXT:    lbu s0, 12(a0)
+; RV32I-NEXT:    lbu s1, 13(a0)
+; RV32I-NEXT:    lbu s2, 14(a0)
+; RV32I-NEXT:    lbu s3, 15(a0)
+; RV32I-NEXT:    lbu s4, 16(a0)
+; RV32I-NEXT:    lbu s5, 17(a0)
+; RV32I-NEXT:    lbu s6, 18(a0)
+; RV32I-NEXT:    lbu s7, 19(a0)
 ; RV32I-NEXT:    slli a4, a4, 8
 ; RV32I-NEXT:    slli a6, a6, 16
 ; RV32I-NEXT:    slli a7, a7, 24
 ; RV32I-NEXT:    or a3, a4, a3
+; RV32I-NEXT:    sw a3, 4(sp) # 4-byte Folded Spill
 ; RV32I-NEXT:    or a4, a7, a6
-; RV32I-NEXT:    lbu t1, 20(a0)
-; RV32I-NEXT:    lbu t2, 21(a0)
-; RV32I-NEXT:    lbu t5, 22(a0)
-; RV32I-NEXT:    lbu s1, 23(a0)
+; RV32I-NEXT:    lbu s8, 20(a0)
+; RV32I-NEXT:    lbu s9, 21(a0)
+; RV32I-NEXT:    lbu s10, 22(a0)
+; RV32I-NEXT:    lbu s11, 23(a0)
 ; RV32I-NEXT:    slli t0, t0, 8
-; RV32I-NEXT:    slli t3, t3, 16
-; RV32I-NEXT:    slli t4, t4, 24
-; RV32I-NEXT:    slli s0, s0, 8
-; RV32I-NEXT:    slli s4, s4, 16
-; RV32I-NEXT:    slli s5, s5, 24
+; RV32I-NEXT:    slli t1, t1, 16
+; RV32I-NEXT:    slli t2, t2, 24
+; RV32I-NEXT:    slli t4, t4, 8
+; RV32I-NEXT:    slli t5, t5, 16
+; RV32I-NEXT:    slli t6, t6, 24
 ; RV32I-NEXT:    or a5, t0, a5
-; RV32I-NEXT:    or a6, t4, t3
-; RV32I-NEXT:    or a7, s0, t6
-; RV32I-NEXT:    or t0, s5, s4
-; RV32I-NEXT:    lbu t3, 24(a0)
-; RV32I-NEXT:    lbu s4, 25(a0)
-; RV32I-NEXT:    lbu s5, 26(a0)
-; RV32I-NEXT:    lbu ra, 27(a0)
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    slli s11, s11, 8
-; RV32I-NEXT:    or t4, s7, s6
-; RV32I-NEXT:    or t6, s9, s8
-; RV32I-NEXT:    or s0, s11, s10
-; RV32I-NEXT:    lbu s6, 28(a0)
-; RV32I-NEXT:    lbu s7, 29(a0)
-; RV32I-NEXT:    lbu s8, 30(a0)
-; RV32I-NEXT:    lbu s9, 31(a0)
-; RV32I-NEXT:    lbu a0, 0(a1)
+; RV32I-NEXT:    or a6, t2, t1
+; RV32I-NEXT:    or a7, t4, t3
+; RV32I-NEXT:    or t0, t6, t5
+; RV32I-NEXT:    lbu ra, 24(a0)
+; RV32I-NEXT:    lbu a3, 25(a0)
+; RV32I-NEXT:    lbu t4, 26(a0)
+; RV32I-NEXT:    lbu t5, 27(a0)
+; RV32I-NEXT:    slli s1, s1, 8
 ; RV32I-NEXT:    slli s2, s2, 16
 ; RV32I-NEXT:    slli s3, s3, 24
-; RV32I-NEXT:    or s2, s3, s2
-; RV32I-NEXT:    addi s3, sp, 8
-; RV32I-NEXT:    slli t2, t2, 8
-; RV32I-NEXT:    slli t5, t5, 16
-; RV32I-NEXT:    slli s1, s1, 24
-; RV32I-NEXT:    slli s4, s4, 8
-; RV32I-NEXT:    slli s5, s5, 16
-; RV32I-NEXT:    slli ra, ra, 24
-; RV32I-NEXT:    slli s7, s7, 8
-; RV32I-NEXT:    slli s8, s8, 16
-; RV32I-NEXT:    slli s9, s9, 24
-; RV32I-NEXT:    or t1, t2, t1
-; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    slli s5, s5, 8
+; RV32I-NEXT:    or t1, s1, s0
+; RV32I-NEXT:    or t2, s3, s2
+; RV32I-NEXT:    or t3, s5, s4
+; RV32I-NEXT:    lbu t6, 28(a0)
+; RV32I-NEXT:    lbu s0, 29(a0)
+; RV32I-NEXT:    lbu s1, 30(a0)
+; RV32I-NEXT:    lbu a0, 31(a0)
+; RV32I-NEXT:    slli s6, s6, 16
+; RV32I-NEXT:    slli s7, s7, 24
+; RV32I-NEXT:    slli s9, s9, 8
+; RV32I-NEXT:    slli s10, s10, 16
+; RV32I-NEXT:    slli s11, s11, 24
+; RV32I-NEXT:    or s2, s7, s6
+; RV32I-NEXT:    or s3, s9, s8
+; RV32I-NEXT:    or s4, s11, s10
+; RV32I-NEXT:    lbu s5, 0(a1)
+; RV32I-NEXT:    lbu s6, 1(a1)
+; RV32I-NEXT:    lbu s7, 2(a1)
+; RV32I-NEXT:    lbu a1, 3(a1)
+; RV32I-NEXT:    slli a3, a3, 8
+; RV32I-NEXT:    or a3, a3, ra
+; RV32I-NEXT:    addi s8, sp, 8
+; RV32I-NEXT:    slli t4, t4, 16
+; RV32I-NEXT:    slli t5, t5, 24
+; RV32I-NEXT:    slli s0, s0, 8
+; RV32I-NEXT:    slli s1, s1, 16
+; RV32I-NEXT:    slli a0, a0, 24
+; RV32I-NEXT:    slli s6, s6, 8
+; RV32I-NEXT:    slli s7, s7, 16
+; RV32I-NEXT:    slli a1, a1, 24
+; RV32I-NEXT:    or t4, t5, t4
+; RV32I-NEXT:    or t5, s0, t6
+; RV32I-NEXT:    or s1, a0, s1
+; RV32I-NEXT:    or t6, s6, s5
+; RV32I-NEXT:    or a1, a1, s7
+; RV32I-NEXT:    srai s0, a0, 31
+; RV32I-NEXT:    lw a0, 4(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    or a4, a4, a0
+; RV32I-NEXT:    or a5, a6, a5
+; RV32I-NEXT:    or a6, t0, a7
+; RV32I-NEXT:    or a7, t2, t1
+; RV32I-NEXT:    or t0, s2, t3
+; RV32I-NEXT:    or t1, s4, s3
+; RV32I-NEXT:    or a3, t4, a3
 ; RV32I-NEXT:    or t2, s1, t5
-; RV32I-NEXT:    andi t5, a0, 31
-; RV32I-NEXT:    or t3, s4, t3
-; RV32I-NEXT:    or s1, ra, s5
-; RV32I-NEXT:    or s4, s7, s6
-; RV32I-NEXT:    or s5, s9, s8
-; RV32I-NEXT:    srai s6, s9, 31
-; RV32I-NEXT:    andi s7, a1, 28
-; RV32I-NEXT:    xori a1, t5, 31
-; RV32I-NEXT:    or a3, a4, a3
-; RV32I-NEXT:    or a4, a6, a5
-; RV32I-NEXT:    or a5, t0, a7
-; RV32I-NEXT:    or a6, t6, t4
-; RV32I-NEXT:    or a7, s2, s0
-; RV32I-NEXT:    or t0, t2, t1
-; RV32I-NEXT:    or t1, s1, t3
-; RV32I-NEXT:    or t2, s5, s4
-; RV32I-NEXT:    sw s6, 56(sp)
-; RV32I-NEXT:    sw s6, 60(sp)
-; RV32I-NEXT:    sw s6, 64(sp)
-; RV32I-NEXT:    sw s6, 68(sp)
-; RV32I-NEXT:    sw s6, 40(sp)
-; RV32I-NEXT:    sw s6, 44(sp)
-; RV32I-NEXT:    sw s6, 48(sp)
-; RV32I-NEXT:    sw s6, 52(sp)
-; RV32I-NEXT:    add s3, s3, s7
-; RV32I-NEXT:    sw a7, 24(sp)
-; RV32I-NEXT:    sw t0, 28(sp)
-; RV32I-NEXT:    sw t1, 32(sp)
+; RV32I-NEXT:    or a0, a1, t6
+; RV32I-NEXT:    sw s0, 56(sp)
+; RV32I-NEXT:    sw s0, 60(sp)
+; RV32I-NEXT:    sw s0, 64(sp)
+; RV32I-NEXT:    sw s0, 68(sp)
+; RV32I-NEXT:    sw s0, 40(sp)
+; RV32I-NEXT:    sw s0, 44(sp)
+; RV32I-NEXT:    sw s0, 48(sp)
+; RV32I-NEXT:    sw s0, 52(sp)
+; RV32I-NEXT:    sw t0, 24(sp)
+; RV32I-NEXT:    sw t1, 28(sp)
+; RV32I-NEXT:    sw a3, 32(sp)
 ; RV32I-NEXT:    sw t2, 36(sp)
-; RV32I-NEXT:    sw a3, 8(sp)
-; RV32I-NEXT:    sw a4, 12(sp)
-; RV32I-NEXT:    sw a5, 16(sp)
-; RV32I-NEXT:    sw a6, 20(sp)
-; RV32I-NEXT:    lw a3, 0(s3)
-; RV32I-NEXT:    lw a4, 4(s3)
-; RV32I-NEXT:    lw a5, 8(s3)
-; RV32I-NEXT:    lw a6, 12(s3)
-; RV32I-NEXT:    lw a7, 16(s3)
-; RV32I-NEXT:    lw t0, 20(s3)
-; RV32I-NEXT:    lw t1, 24(s3)
-; RV32I-NEXT:    lw t2, 28(s3)
-; RV32I-NEXT:    srl t3, a4, a0
-; RV32I-NEXT:    slli t4, a5, 1
+; RV32I-NEXT:    sw a4, 8(sp)
+; RV32I-NEXT:    sw a5, 12(sp)
+; RV32I-NEXT:    sw a6, 16(sp)
+; RV32I-NEXT:    sw a7, 20(sp)
+; RV32I-NEXT:    srli a1, a0, 3
+; RV32I-NEXT:    andi a3, a0, 31
+; RV32I-NEXT:    andi a4, a1, 28
+; RV32I-NEXT:    xori a1, a3, 31
+; RV32I-NEXT:    add a4, s8, a4
+; RV32I-NEXT:    lw a3, 0(a4)
+; RV32I-NEXT:    lw a5, 4(a4)
+; RV32I-NEXT:    lw a6, 8(a4)
+; RV32I-NEXT:    lw a7, 12(a4)
+; RV32I-NEXT:    lw t0, 16(a4)
+; RV32I-NEXT:    lw t1, 20(a4)
+; RV32I-NEXT:    lw t2, 24(a4)
+; RV32I-NEXT:    lw a4, 28(a4)
+; RV32I-NEXT:    srl t3, a5, a0
+; RV32I-NEXT:    slli t4, a6, 1
 ; RV32I-NEXT:    srl a3, a3, a0
-; RV32I-NEXT:    slli a4, a4, 1
-; RV32I-NEXT:    srl t5, a6, a0
-; RV32I-NEXT:    slli t6, a7, 1
-; RV32I-NEXT:    srl a5, a5, a0
-; RV32I-NEXT:    slli a6, a6, 1
-; RV32I-NEXT:    srl s0, t0, a0
-; RV32I-NEXT:    slli s1, t1, 1
-; RV32I-NEXT:    srl a7, a7, a0
-; RV32I-NEXT:    slli t0, t0, 1
-; RV32I-NEXT:    srl t1, t1, a0
-; RV32I-NEXT:    slli s2, t2, 1
-; RV32I-NEXT:    sra t2, t2, a0
+; RV32I-NEXT:    slli a5, a5, 1
+; RV32I-NEXT:    srl t5, a7, a0
+; RV32I-NEXT:    slli t6, t0, 1
+; RV32I-NEXT:    srl a6, a6, a0
+; RV32I-NEXT:    slli a7, a7, 1
+; RV32I-NEXT:    srl s0, t1, a0
+; RV32I-NEXT:    slli s1, t2, 1
+; RV32I-NEXT:    srl t0, t0, a0
+; RV32I-NEXT:    slli t1, t1, 1
+; RV32I-NEXT:    srl t2, t2, a0
+; RV32I-NEXT:    slli s2, a4, 1
+; RV32I-NEXT:    sra s3, a4, a0
 ; RV32I-NEXT:    sll a0, t4, a1
-; RV32I-NEXT:    sll a4, a4, a1
-; RV32I-NEXT:    sll t4, t6, a1
-; RV32I-NEXT:    sll a6, a6, a1
-; RV32I-NEXT:    sll t6, s1, a1
-; RV32I-NEXT:    sll t0, t0, a1
-; RV32I-NEXT:    sll s1, s2, a1
-; RV32I-NEXT:    srli s2, t2, 24
-; RV32I-NEXT:    srli s3, t2, 16
-; RV32I-NEXT:    srli s4, t2, 8
+; RV32I-NEXT:    sll a4, a5, a1
+; RV32I-NEXT:    sll a5, t6, a1
+; RV32I-NEXT:    sll a7, a7, a1
+; RV32I-NEXT:    sll t4, s1, a1
+; RV32I-NEXT:    sll t1, t1, a1
+; RV32I-NEXT:    sll t6, s2, a1
+; RV32I-NEXT:    srli s1, s3, 24
+; RV32I-NEXT:    srli s2, s3, 16
+; RV32I-NEXT:    srli s4, s3, 8
 ; RV32I-NEXT:    or a0, t3, a0
 ; RV32I-NEXT:    or a1, a3, a4
-; RV32I-NEXT:    or a3, t5, t4
-; RV32I-NEXT:    or a4, a5, a6
-; RV32I-NEXT:    or a5, s0, t6
-; RV32I-NEXT:    or a6, a7, t0
-; RV32I-NEXT:    or a7, t1, s1
-; RV32I-NEXT:    sb t2, 28(a2)
+; RV32I-NEXT:    or a3, t5, a5
+; RV32I-NEXT:    or a4, a6, a7
+; RV32I-NEXT:    or a5, s0, t4
+; RV32I-NEXT:    or a6, t0, t1
+; RV32I-NEXT:    or a7, t2, t6
+; RV32I-NEXT:    sb s3, 28(a2)
 ; RV32I-NEXT:    sb s4, 29(a2)
-; RV32I-NEXT:    sb s3, 30(a2)
-; RV32I-NEXT:    sb s2, 31(a2)
+; RV32I-NEXT:    sb s2, 30(a2)
+; RV32I-NEXT:    sb s1, 31(a2)
 ; RV32I-NEXT:    srli t0, a7, 24
 ; RV32I-NEXT:    srli t1, a7, 16
 ; RV32I-NEXT:    srli t2, a7, 8
diff --git a/llvm/test/CodeGen/RISCV/xqciac.ll b/llvm/test/CodeGen/RISCV/xqciac.ll
index a3b4e7829a51..6fdc63fddbc3 100644
--- a/llvm/test/CodeGen/RISCV/xqciac.ll
+++ b/llvm/test/CodeGen/RISCV/xqciac.ll
@@ -231,12 +231,12 @@ define dso_local i32 @pow2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: pow2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: pow2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %mul = mul nsw i32 %b, 32
@@ -276,12 +276,12 @@ define dso_local i32 @shladd(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladd:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladd:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shl = shl nsw i32 %b, 31
@@ -305,9 +305,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladd64:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    srli a4, a2, 1
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a2, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a2, a0, 31
 ; RV32IMXQCIAC-NEXT:    slli a2, a2, 31
-; RV32IMXQCIAC-NEXT:    qc.shladd a3, a4, a3, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a3, a3, a4, 31
 ; RV32IMXQCIAC-NEXT:    sltu a2, a0, a2
 ; RV32IMXQCIAC-NEXT:    add a1, a1, a3
 ; RV32IMXQCIAC-NEXT:    add a1, a1, a2
@@ -316,9 +316,9 @@ define dso_local i64 @shladd64(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IZBAMXQCIAC-LABEL: shladd64:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    srli a4, a2, 1
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a2, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a2, a0, 31
 ; RV32IZBAMXQCIAC-NEXT:    slli a2, a2, 31
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a3, a4, a3, 31
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a3, a3, a4, 31
 ; RV32IZBAMXQCIAC-NEXT:    sltu a2, a0, a2
 ; RV32IZBAMXQCIAC-NEXT:    add a1, a1, a3
 ; RV32IZBAMXQCIAC-NEXT:    add a1, a1, a2
@@ -338,12 +338,12 @@ define dso_local i32 @shladd_ordisjoint(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladd_ordisjoint:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 22
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 22
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladd_ordisjoint:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 22
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 22
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shl = shl nsw i32 %b, 22
@@ -361,13 +361,13 @@ define dso_local i32 @shladdc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ;
 ; RV32IMXQCIAC-LABEL: shladdc1c2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
 ; RV32IMXQCIAC-NEXT:    slli a0, a0, 26
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1c2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 5
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 5
 ; RV32IZBAMXQCIAC-NEXT:    slli a0, a0, 26
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
@@ -388,7 +388,7 @@ define dso_local i32 @shxaddc1c2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shxaddc1c2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    slli a1, a1, 28
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 31
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 31
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shxaddc1c2:
@@ -417,18 +417,18 @@ define dso_local i64 @shladdc1c264(i64 %a, i64 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladdc1c264:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    srli a1, a2, 12
-; RV32IMXQCIAC-NEXT:    qc.shladd a1, a1, a3, 20
+; RV32IMXQCIAC-NEXT:    qc.shladd a1, a3, a1, 20
 ; RV32IMXQCIAC-NEXT:    slli a2, a2, 20
-; RV32IMXQCIAC-NEXT:    qc.shladd a1, a1, a0, 23
+; RV32IMXQCIAC-NEXT:    qc.shladd a1, a0, a1, 23
 ; RV32IMXQCIAC-NEXT:    mv a0, a2
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1c264:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    srli a1, a2, 12
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a1, a3, 20
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a3, a1, 20
 ; RV32IZBAMXQCIAC-NEXT:    slli a2, a2, 20
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a1, a0, 23
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a1, a0, a1, 23
 ; RV32IZBAMXQCIAC-NEXT:    mv a0, a2
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
@@ -449,13 +449,13 @@ define dso_local i32 @shladdc1equalc2(i32 %a, i32 %b) local_unnamed_addr #0 {
 ; RV32IMXQCIAC-LABEL: shladdc1equalc2:
 ; RV32IMXQCIAC:       # %bb.0: # %entry
 ; RV32IMXQCIAC-NEXT:    slli a1, a1, 12
-; RV32IMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 12
+; RV32IMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 12
 ; RV32IMXQCIAC-NEXT:    ret
 ;
 ; RV32IZBAMXQCIAC-LABEL: shladdc1equalc2:
 ; RV32IZBAMXQCIAC:       # %bb.0: # %entry
 ; RV32IZBAMXQCIAC-NEXT:    slli a1, a1, 12
-; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a1, a0, 12
+; RV32IZBAMXQCIAC-NEXT:    qc.shladd a0, a0, a1, 12
 ; RV32IZBAMXQCIAC-NEXT:    ret
 entry:
   %shlc1 = shl nsw i32 %a, 12
@@ -463,3 +463,30 @@ entry:
   %add = add nsw i32 %shlc1, %shlc2
   ret i32 %add
 }
+
+define i32 @testmuliaddnegimm(i32 %a) {
+; RV32IM-LABEL: testmuliaddnegimm:
+; RV32IM:       # %bb.0:
+; RV32IM-NEXT:    slli a1, a0, 1
+; RV32IM-NEXT:    add a0, a1, a0
+; RV32IM-NEXT:    li a1, 3
+; RV32IM-NEXT:    sub a0, a1, a0
+; RV32IM-NEXT:    ret
+;
+; RV32IMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IMXQCIAC:       # %bb.0:
+; RV32IMXQCIAC-NEXT:    li a1, 3
+; RV32IMXQCIAC-NEXT:    qc.muliadd a1, a0, -3
+; RV32IMXQCIAC-NEXT:    mv a0, a1
+; RV32IMXQCIAC-NEXT:    ret
+;
+; RV32IZBAMXQCIAC-LABEL: testmuliaddnegimm:
+; RV32IZBAMXQCIAC:       # %bb.0:
+; RV32IZBAMXQCIAC-NEXT:    li a1, 3
+; RV32IZBAMXQCIAC-NEXT:    qc.muliadd a1, a0, -3
+; RV32IZBAMXQCIAC-NEXT:    mv a0, a1
+; RV32IZBAMXQCIAC-NEXT:    ret
+  %mul = mul i32 %a, -3
+  %add = add i32 %mul, 3
+  ret i32 %add
+}
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
index f227fa9aa423..2fa06517508c 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll
@@ -105,6 +105,7 @@ define i16 @test_cttz_i16(i16 %a) nounwind {
 ;
 ; RV32ZBBXQCIBM-LABEL: test_cttz_i16:
 ; RV32ZBBXQCIBM:       # %bb.0:
+; RV32ZBBXQCIBM-NEXT:    not a0, a0
 ; RV32ZBBXQCIBM-NEXT:    qc.insbi a0, -1, 1, 16
 ; RV32ZBBXQCIBM-NEXT:    ctz a0, a0
 ; RV32ZBBXQCIBM-NEXT:    ret
diff --git a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
index 6b7f9ae85662..88054a691bad 100644
--- a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
+++ b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll
@@ -47,6 +47,29 @@ define i32 @test_insbi_mask(i32 %a) nounwind {
   ret i32 %or
 }
 
+define i32 @test_insbi_mask_mv(i32 %a, i32 %b) nounwind {
+; RV32I-LABEL: test_insbi_mask_mv:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a0, 16
+; RV32I-NEXT:    addi a0, a0, -1
+; RV32I-NEXT:    or a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32IXQCIBM-LABEL: test_insbi_mask_mv:
+; RV32IXQCIBM:       # %bb.0:
+; RV32IXQCIBM-NEXT:    mv a0, a1
+; RV32IXQCIBM-NEXT:    qc.insbi a0, -1, 16, 0
+; RV32IXQCIBM-NEXT:    ret
+;
+; RV32IXQCIBMZBS-LABEL: test_insbi_mask_mv:
+; RV32IXQCIBMZBS:       # %bb.0:
+; RV32IXQCIBMZBS-NEXT:    mv a0, a1
+; RV32IXQCIBMZBS-NEXT:    qc.insbi a0, -1, 16, 0
+; RV32IXQCIBMZBS-NEXT:    ret
+  %or = or i32 %b, 65535
+  ret i32 %or
+}
+
 define i32 @test_insbi_shifted_mask(i32 %a) nounwind {
 ; RV32I-LABEL: test_insbi_shifted_mask:
 ; RV32I:       # %bb.0:
@@ -67,6 +90,36 @@ define i32 @test_insbi_shifted_mask(i32 %a) nounwind {
   ret i32 %or
 }
 
+define i32 @test_insbi_shifted_mask_multiple_uses(i32 %a) nounwind {
+; RV32I-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    lui a1, 15
+; RV32I-NEXT:    or a1, a0, a1
+; RV32I-NEXT:    addi a0, a0, 10
+; RV32I-NEXT:    xor a0, a1, a0
+; RV32I-NEXT:    ret
+;
+; RV32IXQCIBM-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32IXQCIBM:       # %bb.0:
+; RV32IXQCIBM-NEXT:    lui a1, 15
+; RV32IXQCIBM-NEXT:    or a1, a1, a0
+; RV32IXQCIBM-NEXT:    addi a0, a0, 10
+; RV32IXQCIBM-NEXT:    xor a0, a0, a1
+; RV32IXQCIBM-NEXT:    ret
+;
+; RV32IXQCIBMZBS-LABEL: test_insbi_shifted_mask_multiple_uses:
+; RV32IXQCIBMZBS:       # %bb.0:
+; RV32IXQCIBMZBS-NEXT:    lui a1, 15
+; RV32IXQCIBMZBS-NEXT:    or a1, a1, a0
+; RV32IXQCIBMZBS-NEXT:    addi a0, a0, 10
+; RV32IXQCIBMZBS-NEXT:    xor a0, a0, a1
+; RV32IXQCIBMZBS-NEXT:    ret
+  %or = or i32 %a, 61440
+  %add = add i32 %a, 10
+  %xor = xor i32 %or, %add
+  ret i32 %xor
+}
+
 define i32 @test_single_bit_set(i32 %a) nounwind {
 ; RV32I-LABEL: test_single_bit_set:
 ; RV32I:       # %bb.0:
diff --git a/llvm/test/CodeGen/SPARC/float-ua2007.ll b/llvm/test/CodeGen/SPARC/float-ua2007.ll
new file mode 100644
index 000000000000..252b47943fe4
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/float-ua2007.ll
@@ -0,0 +1,275 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc64 --fp-contract=fast -mattr=-ua2007 < %s | FileCheck %s -check-prefix=NO-UA2007
+; RUN: llc -mtriple=sparc64 --fp-contract=fast -mattr=+ua2007 < %s | FileCheck %s -check-prefix=UA2007
+
+define float @fmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+;
+; UA2007-LABEL: fmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmadds %f1, %f3, %f5, %f0
+  %ret = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  ret float %ret
+}
+
+define double @fmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+;
+; UA2007-LABEL: fmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmaddd %f0, %f2, %f4, %f0
+  %ret = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  ret double %ret
+}
+
+define float @fmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+;
+; UA2007-LABEL: fmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubs %f1, %f3, %f5, %f0
+  %neg = fneg float %c
+  %ret = call float @llvm.fmuladd.f32(float %a, float %b, float %neg)
+  ret float %ret
+}
+
+define double @fmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+;
+; UA2007-LABEL: fmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubd %f0, %f2, %f4, %f0
+  %neg = fneg double %c
+  %ret = call double @llvm.fmuladd.f64(double %a, double %b, double %neg)
+  ret double %ret
+}
+
+define float @fnmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fnmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: fnmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmadds %f1, %f3, %f5, %f0
+  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
+  %ret = fneg float %fma
+  ret float %ret
+}
+
+define double @fnmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fnmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: fnmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmaddd %f0, %f2, %f4, %f0
+  %fma = call double @llvm.fmuladd.f64(double %a, double %b, double %c)
+  %ret = fneg double %fma
+  ret double %ret
+}
+
+define float @fnmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: fnmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: fnmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubs %f1, %f3, %f5, %f0
+  %neg = fneg float %c
+  %fma = call float @llvm.fmuladd.f32(float %a, float %b, float %neg)
+  %ret = fneg float %fma
+  ret float %ret
+}
+
+define double @fnmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: fnmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: fnmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubd %f0, %f2, %f4, %f0
+  %neg = fneg double %c
+  %fma = call double @llvm.fmuladd.f64(double %a, double %b, double %neg)
+  %ret = fneg double %fma
+  ret double %ret
+}
+
+
+define float @combine_madds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_madds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+;
+; UA2007-LABEL: combine_madds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmadds %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %add = fadd float %mul, %c
+  ret float %add
+}
+
+define double @combine_maddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_maddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+;
+; UA2007-LABEL: combine_maddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmaddd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %add = fadd double %mul, %c
+  ret double %add
+}
+
+define float @combine_msubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_msubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+;
+; UA2007-LABEL: combine_msubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubs %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c
+  ret float %sub
+}
+
+define double @combine_msubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_msubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+;
+; UA2007-LABEL: combine_msubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fmsubd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c
+  ret double %sub
+}
+
+define float @combine_nmadds(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_nmadds:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fadds %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: combine_nmadds:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmadds %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %add = fadd float %mul, %c
+  %neg = fneg float %add
+  ret float %neg
+}
+
+define double @combine_nmaddd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_nmaddd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    faddd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: combine_nmaddd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmaddd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %add = fadd double %mul, %c
+  %neg = fneg double %add
+  ret double %neg
+}
+
+define float @combine_nmsubs(float %a, float %b, float %c) nounwind {
+; NO-UA2007-LABEL: combine_nmsubs:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuls %f1, %f3, %f0
+; NO-UA2007-NEXT:    fsubs %f0, %f5, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegs %f0, %f0
+;
+; UA2007-LABEL: combine_nmsubs:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubs %f1, %f3, %f5, %f0
+  %mul = fmul float %a, %b
+  %sub = fsub float %mul, %c
+  %neg = fneg float %sub
+  ret float %neg
+}
+
+define double @combine_nmsubd(double %a, double %b, double %c) nounwind {
+; NO-UA2007-LABEL: combine_nmsubd:
+; NO-UA2007:       ! %bb.0:
+; NO-UA2007-NEXT:    fmuld %f0, %f2, %f0
+; NO-UA2007-NEXT:    fsubd %f0, %f4, %f0
+; NO-UA2007-NEXT:    retl
+; NO-UA2007-NEXT:    fnegd %f0, %f0
+;
+; UA2007-LABEL: combine_nmsubd:
+; UA2007:       ! %bb.0:
+; UA2007-NEXT:    retl
+; UA2007-NEXT:    fnmsubd %f0, %f2, %f4, %f0
+  %mul = fmul double %a, %b
+  %sub = fsub double %mul, %c
+  %neg = fneg double %sub
+  ret double %neg
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+declare double @llvm.fmuladd.f64(double, double, double)
diff --git a/llvm/test/CodeGen/SPARC/tls-sp.ll b/llvm/test/CodeGen/SPARC/tls-sp.ll
new file mode 100644
index 000000000000..de9af01398d2
--- /dev/null
+++ b/llvm/test/CodeGen/SPARC/tls-sp.ll
@@ -0,0 +1,105 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=sparc -relocation-model=pic < %s | FileCheck --check-prefix=SPARC %s
+; RUN: llc -mtriple=sparc64 -relocation-model=pic < %s | FileCheck --check-prefix=SPARC64 %s
+
+@x = external thread_local global i8
+
+;; Test that we don't over-allocate stack space when calling __tls_get_addr
+;; with the call frame pseudos able to be eliminated.
+define ptr @no_alloca() nounwind {
+; SPARC-LABEL: no_alloca:
+; SPARC:       ! %bb.0: ! %entry
+; SPARC-NEXT:    save %sp, -96, %sp
+; SPARC-NEXT:  .Ltmp0:
+; SPARC-NEXT:    call .Ltmp1
+; SPARC-NEXT:  .Ltmp2:
+; SPARC-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC-NEXT:  .Ltmp1:
+; SPARC-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC-NEXT:    add %i0, %o7, %i0
+; SPARC-NEXT:    sethi %tgd_hi22(x), %i1
+; SPARC-NEXT:    add %i1, %tgd_lo10(x), %i1
+; SPARC-NEXT:    add %i0, %i1, %o0, %tgd_add(x)
+; SPARC-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    ret
+; SPARC-NEXT:    restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: no_alloca:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    save %sp, -128, %sp
+; SPARC64-NEXT:  .Ltmp0:
+; SPARC64-NEXT:    rd %pc, %o7
+; SPARC64-NEXT:  .Ltmp2:
+; SPARC64-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp2-.Ltmp0)), %i0
+; SPARC64-NEXT:  .Ltmp1:
+; SPARC64-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.Ltmp0)), %i0
+; SPARC64-NEXT:    add %i0, %o7, %i0
+; SPARC64-NEXT:    sethi %tgd_hi22(x), %i1
+; SPARC64-NEXT:    add %i1, %tgd_lo10(x), %i1
+; SPARC64-NEXT:    add %i0, %i1, %o0, %tgd_add(x)
+; SPARC64-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:    ret
+; SPARC64-NEXT:    restore %g0, %o0, %o0
+entry:
+  %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  ret ptr %0
+}
+
+;; Test that %sp is valid for the call to __tls_get_addr. We store to a dynamic
+;; alloca in order to prevent eliminating any call frame pseudos from the call.
+define ptr @dynamic_alloca(i64 %n) nounwind {
+; SPARC-LABEL: dynamic_alloca:
+; SPARC:       ! %bb.0: ! %entry
+; SPARC-NEXT:    save %sp, -96, %sp
+; SPARC-NEXT:  .Ltmp3:
+; SPARC-NEXT:    call .Ltmp4
+; SPARC-NEXT:  .Ltmp5:
+; SPARC-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i0
+; SPARC-NEXT:  .Ltmp4:
+; SPARC-NEXT:    or %i0, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i0
+; SPARC-NEXT:    add %i0, %o7, %i0
+; SPARC-NEXT:    sethi %tgd_hi22(x), %i2
+; SPARC-NEXT:    add %i2, %tgd_lo10(x), %i2
+; SPARC-NEXT:    add %i0, %i2, %o0, %tgd_add(x)
+; SPARC-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC-NEXT:    nop
+; SPARC-NEXT:    add %i1, 7, %i0
+; SPARC-NEXT:    and %i0, -8, %i0
+; SPARC-NEXT:    sub %sp, %i0, %i0
+; SPARC-NEXT:    add %i0, -8, %sp
+; SPARC-NEXT:    mov 1, %i1
+; SPARC-NEXT:    stb %i1, [%i0+88]
+; SPARC-NEXT:    ret
+; SPARC-NEXT:    restore %g0, %o0, %o0
+;
+; SPARC64-LABEL: dynamic_alloca:
+; SPARC64:       ! %bb.0: ! %entry
+; SPARC64-NEXT:    save %sp, -128, %sp
+; SPARC64-NEXT:  .Ltmp3:
+; SPARC64-NEXT:    rd %pc, %o7
+; SPARC64-NEXT:  .Ltmp5:
+; SPARC64-NEXT:    sethi %hi(_GLOBAL_OFFSET_TABLE_+(.Ltmp5-.Ltmp3)), %i1
+; SPARC64-NEXT:  .Ltmp4:
+; SPARC64-NEXT:    or %i1, %lo(_GLOBAL_OFFSET_TABLE_+(.Ltmp4-.Ltmp3)), %i1
+; SPARC64-NEXT:    add %i1, %o7, %i1
+; SPARC64-NEXT:    sethi %tgd_hi22(x), %i2
+; SPARC64-NEXT:    add %i2, %tgd_lo10(x), %i2
+; SPARC64-NEXT:    add %i1, %i2, %o0, %tgd_add(x)
+; SPARC64-NEXT:    call __tls_get_addr, %tgd_call(x)
+; SPARC64-NEXT:    nop
+; SPARC64-NEXT:    add %i0, 15, %i0
+; SPARC64-NEXT:    and %i0, -16, %i0
+; SPARC64-NEXT:    sub %sp, %i0, %i0
+; SPARC64-NEXT:    mov %i0, %sp
+; SPARC64-NEXT:    mov 1, %i1
+; SPARC64-NEXT:    stb %i1, [%i0+2175]
+; SPARC64-NEXT:    ret
+; SPARC64-NEXT:    restore %g0, %o0, %o0
+entry:
+  %0 = call ptr @llvm.threadlocal.address.p0(ptr @x)
+  %1 = alloca i8, i64 %n
+  store i8 1, ptr %1
+  ret ptr %0
+}
diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll
index bbf4d50bd716..8a6a30318ae5 100644
--- a/llvm/test/CodeGen/SystemZ/pr60413.ll
+++ b/llvm/test/CodeGen/SystemZ/pr60413.ll
@@ -16,31 +16,31 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    stmg %r13, %r15, 104(%r15)
 ; CHECK-NEXT:    aghi %r15, -168
 ; CHECK-NEXT:    lhrl %r1, f+4
+; CHECK-NEXT:    sll %r1, 8
 ; CHECK-NEXT:    larl %r2, f
-; CHECK-NEXT:    llc %r2, 6(%r2)
-; CHECK-NEXT:    larl %r3, e
-; CHECK-NEXT:    lb %r0, 3(%r3)
-; CHECK-NEXT:    rosbg %r2, %r1, 32, 55, 8
-; CHECK-NEXT:    vlvgp %v0, %r2, %r0
-; CHECK-NEXT:    vlvgf %v0, %r2, 0
-; CHECK-NEXT:    vlvgf %v0, %r2, 2
-; CHECK-NEXT:    vlvgp %v1, %r0, %r2
-; CHECK-NEXT:    vlvgp %v2, %r2, %r2
-; CHECK-NEXT:    lr %r1, %r2
+; CHECK-NEXT:    ic %r1, 6(%r2)
+; CHECK-NEXT:    larl %r2, e
+; CHECK-NEXT:    lb %r0, 3(%r2)
+; CHECK-NEXT:    vlvgp %v0, %r0, %r1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlvgf %v1, %r1, 0
+; CHECK-NEXT:    vlvgf %v1, %r1, 2
+; CHECK-NEXT:    vlvgp %v2, %r1, %r1
+; CHECK-NEXT:    # kill: def $r1l killed $r1l killed $r1d
 ; CHECK-NEXT:    nilh %r1, 255
 ; CHECK-NEXT:    chi %r1, 128
 ; CHECK-NEXT:    ipm %r1
 ; CHECK-NEXT:    risbg %r1, %r1, 63, 191, 36
+; CHECK-NEXT:    vlvgf %v0, %r0, 0
+; CHECK-NEXT:    vlvgf %v0, %r0, 2
 ; CHECK-NEXT:    vgbm %v3, 30583
 ; CHECK-NEXT:    vn %v0, %v0, %v3
-; CHECK-NEXT:    vlvgf %v1, %r0, 0
-; CHECK-NEXT:    vlvgf %v1, %r0, 2
 ; CHECK-NEXT:    vn %v1, %v1, %v3
 ; CHECK-NEXT:    vrepf %v2, %v2, 1
 ; CHECK-NEXT:    vn %v2, %v2, %v3
 ; CHECK-NEXT:    vrepif %v3, 127
-; CHECK-NEXT:    vchlf %v0, %v0, %v3
-; CHECK-NEXT:    vlgvf %r13, %v0, 0
+; CHECK-NEXT:    vchlf %v1, %v1, %v3
+; CHECK-NEXT:    vlgvf %r13, %v1, 0
 ; CHECK-NEXT:    vchlf %v2, %v2, %v3
 ; CHECK-NEXT:    vlgvf %r3, %v2, 1
 ; CHECK-NEXT:    nilf %r3, 1
@@ -54,13 +54,13 @@ define dso_local void @m() local_unnamed_addr #1 {
 ; CHECK-NEXT:    nilf %r14, 1
 ; CHECK-NEXT:    rosbg %r2, %r14, 32, 51, 12
 ; CHECK-NEXT:    rosbg %r2, %r13, 52, 52, 11
-; CHECK-NEXT:    vlgvf %r13, %v0, 1
+; CHECK-NEXT:    vlgvf %r13, %v1, 1
 ; CHECK-NEXT:    rosbg %r2, %r13, 53, 53, 10
-; CHECK-NEXT:    vlgvf %r13, %v0, 2
+; CHECK-NEXT:    vlgvf %r13, %v1, 2
 ; CHECK-NEXT:    rosbg %r2, %r13, 54, 54, 9
-; CHECK-NEXT:    vlgvf %r13, %v0, 3
+; CHECK-NEXT:    vlgvf %r13, %v1, 3
 ; CHECK-NEXT:    rosbg %r2, %r13, 55, 55, 8
-; CHECK-NEXT:    vchlf %v0, %v1, %v3
+; CHECK-NEXT:    vchlf %v0, %v0, %v3
 ; CHECK-NEXT:    vlgvf %r13, %v0, 0
 ; CHECK-NEXT:    rosbg %r2, %r13, 56, 56, 7
 ; CHECK-NEXT:    vlgvf %r13, %v0, 1
diff --git a/llvm/test/CodeGen/WebAssembly/removed-terminator.ll b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
new file mode 100644
index 000000000000..188f6f67eee8
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/removed-terminator.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O0 -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+define void @test(i1 %x) {
+; CHECK-LABEL: test:
+; CHECK:         .functype test (i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32.const -1
+; CHECK-NEXT:    i32.xor
+; CHECK-NEXT:    i32.const 1
+; CHECK-NEXT:    i32.and
+; CHECK-NEXT:    drop
+; CHECK-NEXT:  # %bb.1: # %exit
+; CHECK-NEXT:    return
+  %y = xor i1 %x, true
+  ; This br_if's operand (%y) is stackified in RegStackify. But this terminator
+  ; will be removed in CFGSort after that. We need to make sure we unstackify %y
+  ; so that it can be dropped in ExplicitLocals.
+  br i1 %y, label %exit, label %exit
+
+exit:
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll
index f6d66ab47ce0..d9064c684cb2 100644
--- a/llvm/test/CodeGen/X86/abds-neg.ll
+++ b/llvm/test/CodeGen/X86/abds-neg.ll
@@ -367,44 +367,49 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    cmovll %ebx, %edi
-; X86-NEXT:    cmovll %ebp, %edx
-; X86-NEXT:    cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    sbbl 32(%ebp), %edx
+; X86-NEXT:    sbbl 36(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %ebx, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl %eax, 8(%edx)
+; X86-NEXT:    movl %edi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -438,44 +443,49 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    subl %ecx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl %edi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll %eax, %esi
-; X86-NEXT:    cmovll %ebx, %edi
-; X86-NEXT:    cmovll %ebp, %edx
-; X86-NEXT:    cmovll (%esp), %ecx # 4-byte Folded Reload
-; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %eax
+; X86-NEXT:    sbbl 32(%ebp), %edx
+; X86-NEXT:    sbbl 36(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %ebx, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:    negl %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %edx
-; X86-NEXT:    sbbl %edi, %edx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %ebx, 4(%edx)
+; X86-NEXT:    movl %eax, 8(%edx)
+; X86-NEXT:    movl %edi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -639,55 +649,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovll %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    cmovll %esi, %edx
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    sbbl 44(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovll %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovll 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmovll 28(%ebp), %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    cmovll %edi, %ecx
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl 32(%ebp), %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovll 36(%ebp), %ebx
+; X86-NEXT:    cmovll 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmovll 28(%ebp), %edi
+; X86-NEXT:    cmovll 24(%ebp), %esi
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %edi, 8(%edx)
+; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -848,37 +862,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovgel (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovgel %ebx, %esi
-; X86-NEXT:    cmovgel %ebp, %ecx
-; X86-NEXT:    cmovgel %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %ebx
+; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovgel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovgel %edi, %esi
+; X86-NEXT:    cmovgel %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1058,15 +1076,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1089,15 +1107,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    xorl %edx, %esi
+; X86-NEXT:    xorl %edx, %ecx
 ; X86-NEXT:    movl %edx, %eax
-; X86-NEXT:    subl %esi, %eax
-; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    sbbl %esi, %edx
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1118,35 +1136,39 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    subl %edi, %ebp
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %ecx
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1175,35 +1197,39 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    subl %edi, %ebp
-; X86-NEXT:    movl %ebx, %edi
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %ecx, %ebx
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %edx
+; X86-NEXT:    sbbl 48(%ebp), %ecx
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    sarl $31, %esi
+; X86-NEXT:    xorl %esi, %eax
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    xorl %esi, %edx
+; X86-NEXT:    xorl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    subl %edi, %ebx
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    sbbl %ecx, %edx
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abds.ll b/llvm/test/CodeGen/X86/abds.ll
index 0356c2702a41..a1a4ba81ae49 100644
--- a/llvm/test/CodeGen/X86/abds.ll
+++ b/llvm/test/CodeGen/X86/abds.ll
@@ -343,37 +343,41 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -404,37 +408,41 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -585,37 +593,41 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -768,37 +780,41 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1027,35 +1043,38 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128:
@@ -1079,35 +1098,38 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_subnsw_i128_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_subnsw_i128_undef:
@@ -1282,37 +1304,41 @@ define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_select_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    subl %edx, %eax
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll (%esp), %edx # 4-byte Folded Reload
-; X86-NEXT:    cmovll %ebx, %esi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    cmovll %eax, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    subl 24(%ebp), %ecx
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    sbbl 36(%ebp), %ebx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovll %edi, %esi
+; X86-NEXT:    cmovll %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu-neg.ll b/llvm/test/CodeGen/X86/abdu-neg.ll
index 6bda99c89a37..b7c34070f1af 100644
--- a/llvm/test/CodeGen/X86/abdu-neg.ll
+++ b/llvm/test/CodeGen/X86/abdu-neg.ll
@@ -355,39 +355,43 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -423,39 +427,43 @@ define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -621,55 +629,59 @@ define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %esi
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    sbbl %edi, %ecx
-; X86-NEXT:    movl %edi, %ecx
-; X86-NEXT:    cmovbl %edx, %ecx
-; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    cmovbl %esi, %edx
-; X86-NEXT:    cmpl %esi, %eax
-; X86-NEXT:    movl %ebx, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %edi, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %ebp, 8(%eax)
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    sbbl 44(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    cmovbl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    cmovbl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    cmovbl 28(%ebp), %eax
+; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    cmovbl %edi, %ecx
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl 32(%ebp), %edi
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    sbbl 36(%ebp), %edi
+; X86-NEXT:    cmovbl 36(%ebp), %ebx
+; X86-NEXT:    cmovbl 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    cmovbl 24(%ebp), %esi
+; X86-NEXT:    subl %esi, %ecx
+; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    movl 8(%ebp), %edx
+; X86-NEXT:    movl %ecx, (%edx)
+; X86-NEXT:    movl %eax, 4(%edx)
+; X86-NEXT:    movl %edi, 8(%edx)
+; X86-NEXT:    movl %esi, 12(%edx)
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -827,39 +839,43 @@ define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %edi, %edi
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    xorl %ebp, %ecx
-; X86-NEXT:    xorl %ebp, %esi
-; X86-NEXT:    xorl %ebp, %ebx
-; X86-NEXT:    xorl %ebp, %edx
-; X86-NEXT:    subl %ebp, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl %ebp, %ecx
-; X86-NEXT:    negl %edx
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebx, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    subl 40(%ebp), %ecx
+; X86-NEXT:    sbbl 44(%ebp), %edi
+; X86-NEXT:    sbbl 48(%ebp), %esi
+; X86-NEXT:    sbbl 52(%ebp), %eax
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
+; X86-NEXT:    xorl %ebx, %eax
+; X86-NEXT:    xorl %ebx, %esi
+; X86-NEXT:    xorl %ebx, %edi
+; X86-NEXT:    xorl %ebx, %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebx, %esi
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    negl %ecx
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebx, 4(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/abdu.ll b/llvm/test/CodeGen/X86/abdu.ll
index 27acec32fd34..043c9155f52f 100644
--- a/llvm/test/CodeGen/X86/abdu.ll
+++ b/llvm/test/CodeGen/X86/abdu.ll
@@ -326,35 +326,38 @@ define i64 @abd_ext_i64_undef(i64 %a, i64 %b) nounwind {
 define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_ext_i128:
@@ -381,35 +384,38 @@ define i128 @abd_ext_i128(i128 %a, i128 %b) nounwind {
 define i128 @abd_ext_i128_undef(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_ext_i128_undef:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_ext_i128_undef:
@@ -548,35 +554,38 @@ define i64 @abd_minmax_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_minmax_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_minmax_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_minmax_i128:
@@ -717,35 +726,38 @@ define i64 @abd_cmp_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_cmp_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_cmp_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_cmp_i128:
@@ -887,35 +899,38 @@ define i64 @abd_select_i64(i64 %a, i64 %b) nounwind {
 define i128 @abd_select_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: abd_select_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    xorl %ebx, %ecx
-; X86-NEXT:    xorl %ebx, %edx
-; X86-NEXT:    xorl %ebx, %esi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    sbbl %ebx, %esi
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    subl 40(%ebp), %edi
+; X86-NEXT:    sbbl 44(%ebp), %esi
+; X86-NEXT:    sbbl 48(%ebp), %edx
+; X86-NEXT:    sbbl 52(%ebp), %ecx
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: abd_select_i128:
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index bae140abdf6b..e252d5953e60 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -144,31 +144,34 @@ define i128 @test_i128(i128 %a) nounwind {
 ;
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %r = call i128 @llvm.abs.i128(i128 %a, i1 false)
   ret i128 %r
@@ -688,13 +691,17 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 ;
 ; X86-LABEL: test_sextinreg_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
 ; X86-NEXT:    subl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ecx
@@ -702,7 +709,9 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 ; X86-NEXT:    movl %ecx, 4(%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %shl = shl i128 %a, 64
   %ashr = ashr exact i128 %shl, 64
diff --git a/llvm/test/CodeGen/X86/add-sub-bool.ll b/llvm/test/CodeGen/X86/add-sub-bool.ll
index c2bfcf57185e..1df284fb9fe2 100644
--- a/llvm/test/CodeGen/X86/add-sub-bool.ll
+++ b/llvm/test/CodeGen/X86/add-sub-bool.ll
@@ -104,18 +104,21 @@ define i24 @test_i24_add_add_idx(i24 %x, i24 %y, i24 %z) nounwind {
 define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-LABEL: test_i128_add_add_idx:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    btl $5, {{[0-9]+}}(%esp)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    addl 24(%ebp), %esi
+; X86-NEXT:    adcl 28(%ebp), %edi
+; X86-NEXT:    adcl 32(%ebp), %ecx
+; X86-NEXT:    adcl 36(%ebp), %edx
+; X86-NEXT:    btl $5, 64(%ebp)
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ecx
@@ -124,8 +127,10 @@ define i128 @test_i128_add_add_idx(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_i128_add_add_idx:
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index 0eb2c630e681..f13627b55856 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -188,11 +188,11 @@ define void @split_i128(ptr %sret, i128 %x) {
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    subl $48, %esp
-; CHECK-NEXT:    movl 12(%ebp), %eax
+; CHECK-NEXT:    movl 24(%ebp), %eax
 ; CHECK-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT:    movl 16(%ebp), %ebx
-; CHECK-NEXT:    movl 20(%ebp), %esi
-; CHECK-NEXT:    movl 24(%ebp), %edi
+; CHECK-NEXT:    movl 28(%ebp), %ebx
+; CHECK-NEXT:    movl 32(%ebp), %esi
+; CHECK-NEXT:    movl 36(%ebp), %edi
 ; CHECK-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll
index 217ccebdfb77..0de308a9e073 100644
--- a/llvm/test/CodeGen/X86/avg.ll
+++ b/llvm/test/CodeGen/X86/avg.ll
@@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-LABEL: not_avg_v16i8_wide_constants:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movaps (%rdi), %xmm1
-; SSE2-NEXT:    movdqa (%rsi), %xmm2
+; SSE2-NEXT:    movdqa (%rsi), %xmm0
 ; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm4
+; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm3
+; SSE2-NEXT:    movd %eax, %xmm4
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm5
@@ -1762,6 +1762,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
+; SSE2-NEXT:    movd %eax, %xmm10
+; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm9
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1771,9 +1774,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm12
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    movd %eax, %xmm10
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    decl %eax
 ; SSE2-NEXT:    movd %eax, %xmm13
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
@@ -1783,43 +1783,45 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; SSE2-NEXT:    movd %eax, %xmm15
 ; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    decl %eax
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
 ; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
 ; SSE2-NEXT:    movapd %xmm4, %xmm5
 ; SSE2-NEXT:    andpd %xmm1, %xmm5
 ; SSE2-NEXT:    xorpd %xmm4, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    paddw %xmm5, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE2-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1]
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; SSE2-NEXT:    movapd %xmm0, %xmm3
-; SSE2-NEXT:    andpd %xmm2, %xmm3
-; SSE2-NEXT:    xorpd %xmm0, %xmm2
-; SSE2-NEXT:    psrlw $1, %xmm2
-; SSE2-NEXT:    paddw %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
+; SSE2-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT:    movapd %xmm2, %xmm3
+; SSE2-NEXT:    andpd %xmm0, %xmm3
+; SSE2-NEXT:    xorpd %xmm2, %xmm0
+; SSE2-NEXT:    psrlw $1, %xmm0
+; SSE2-NEXT:    paddw %xmm3, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT:    pand %xmm2, %xmm0
+; SSE2-NEXT:    pand %xmm2, %xmm1
+; SSE2-NEXT:    packuswb %xmm0, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    retq
 ;
@@ -1829,74 +1831,75 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind {
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX1-NEXT:    vpextrd $2, %xmm5, %ecx
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX1-NEXT:    vpextrd $2, %xmm4, %eax
-; AVX1-NEXT:    vpextrw $3, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $7, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $6, %xmm3, %ecx
+; AVX1-NEXT:    vpextrw $5, %xmm3, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm4
-; AVX1-NEXT:    vpextrw $2, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm5
-; AVX1-NEXT:    vpextrw $1, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm6
-; AVX1-NEXT:    vpextrw $0, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $4, %xmm3, %edx
+; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm5
+; AVX1-NEXT:    vpextrw $1, %xmm3, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm6
+; AVX1-NEXT:    vpextrw $0, %xmm3, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm7
-; AVX1-NEXT:    vpextrw $3, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm8
-; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
+; AVX1-NEXT:    vpextrw $3, %xmm3, %edx
+; AVX1-NEXT:    decq %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm8
+; AVX1-NEXT:    vpextrw $2, %xmm3, %ecx
+; AVX1-NEXT:    decq %rax
+; AVX1-NEXT:    vmovq %rax, %xmm3
+; AVX1-NEXT:    vpextrw $7, %xmm2, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm9
-; AVX1-NEXT:    vpextrw $1, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm10
-; AVX1-NEXT:    vpextrw $0, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm11
-; AVX1-NEXT:    vpextrw $5, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $6, %xmm2, %edx
+; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    vmovd %ecx, %xmm10
+; AVX1-NEXT:    vpextrw $5, %xmm2, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm11
+; AVX1-NEXT:    vpextrw $4, %xmm2, %eax
 ; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vmovd %edx, %xmm12
-; AVX1-NEXT:    vpextrw $4, %xmm3, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm13
-; AVX1-NEXT:    vpextrw $5, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm14
-; AVX1-NEXT:    vpextrw $4, %xmm2, %edx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm15
-; AVX1-NEXT:    vpextrw $7, %xmm3, %edx
+; AVX1-NEXT:    vpextrw $1, %xmm2, %edx
 ; AVX1-NEXT:    decl %ecx
-; AVX1-NEXT:    vmovd %ecx, %xmm3
-; AVX1-NEXT:    vpextrw $7, %xmm2, %ecx
-; AVX1-NEXT:    decl %edx
-; AVX1-NEXT:    vmovd %edx, %xmm2
+; AVX1-NEXT:    vmovd %ecx, %xmm13
+; AVX1-NEXT:    vpextrw $0, %xmm2, %ecx
+; AVX1-NEXT:    decl %eax
+; AVX1-NEXT:    vmovd %eax, %xmm14
+; AVX1-NEXT:    vpextrw $3, %xmm2, %eax
+; AVX1-NEXT:    decq %rdx
+; AVX1-NEXT:    vmovq %rdx, %xmm15
+; AVX1-NEXT:    vpextrw $2, %xmm2, %edx
+; AVX1-NEXT:    decq %rcx
+; AVX1-NEXT:    vmovq %rcx, %xmm2
 ; AVX1-NEXT:    decl %eax
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
 ; AVX1-NEXT:    vmovd %eax, %xmm5
-; AVX1-NEXT:    decl %ecx
+; AVX1-NEXT:    decl %edx
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX1-NEXT:    vmovd %ecx, %xmm7
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm8, %ymm6
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX1-NEXT:    vmovddup {{.*#+}} ymm3 = ymm6[0,0,2,2]
-; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm5, %ymm2
-; AVX1-NEXT:    vshufps {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5],ymm2[6,7]
+; AVX1-NEXT:    vmovd %edx, %xmm7
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,0,1,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2,3],xmm3[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[0,0,0,0]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3,4,5],xmm4[6,7]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vandps %ymm0, %ymm2, %ymm1
 ; AVX1-NEXT:    vxorps %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
index e449c7192e4b..b60d7a5463d6 100644
--- a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll
@@ -278,14 +278,14 @@ define <4 x float> @test_int_x86_avx512fp16_maskz_cfcmadd_sh(<4 x float> %x0, <4
   ret <4 x float> %res
 }
 
-define <4 x float> @PR98306() {
+define <4 x float> @PR98306(i8 %m) {
 ; CHECK-LABEL: PR98306:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kxorw %k0, %k0, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm1 = [7.8125E-3,1.050912E+6,4.203776E+6,1.6815616E+7]
 ; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [3.2E+1,4.03288064E+8,8.0658432E+8,1.61318502E+9]
 ; CHECK-NEXT:    vfmaddcsh {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 0, i32 4)
+  %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> <float 7.812500e-03, float 0x4130092000000000, float 0x4150094000000000, float 0x4170096000000000>, <4 x float> <float 2.000000e+00, float 0x4188098000000000, float 0x4198099000000000, float 0x41A809A000000000>, <4 x float> <float 3.200000e+01, float 0x41B809B000000000, float 0x41C809C000000000, float 0x41D809D000000000>, i8 %m, i32 4)
   ret <4 x float> %res
 }
diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
index f66f0c0ceabc..cc58bc1e44f3 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll
@@ -628,13 +628,19 @@ define half @s128_to_half(i128 %x) {
 ;
 ; X86-LABEL: s128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __floattihf
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %a = sitofp i128 %x to half
   ret half %a
@@ -713,13 +719,19 @@ define half @u128_to_half(i128 %x) {
 ;
 ; X86-LABEL: u128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __floatuntihf
-; X86-NEXT:    addl $16, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
   %a = uitofp i128 %x to half
   ret half %a
@@ -1020,11 +1032,15 @@ define half @f128_to_half(fp128 %x) nounwind {
 ;
 ; X86-LABEL: f128_to_half:
 ; X86:       # %bb.0:
-; X86-NEXT:    subl $16, %esp
-; X86-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-NEXT:    vmovups %xmm0, (%esp)
 ; X86-NEXT:    calll __trunctfhf2
-; X86-NEXT:    addl $16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
   %a = fptrunc fp128 %x to half
   ret half %a
diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
index 627a94799424..b1bacd92f073 100644
--- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll
@@ -1361,3 +1361,19 @@ define <32 x half> @test_mm512_castph256_ph512_freeze(<16 x half> %a0) nounwind
   %res = shufflevector <16 x half> %a0, <16 x half> %a1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   ret <32 x half> %res
 }
+
+define <8 x half> @PR153570(ptr %p) {
+; CHECK-LABEL: PR153570:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT:    vmulsh {rn-sae}, %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
+; CHECK-NEXT:    vmovsh %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, (%rdi)
+; CHECK-NEXT:    retq
+  %r = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 0, i32 8)
+  store <8 x half> %r, ptr %p, align 16
+  %r1 = tail call <8 x half> @llvm.x86.avx512fp16.mask.mul.sh.round(<8 x half> <half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00, half 0xH3C00>, <8 x half> <half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000, half 0xH4000>, <8 x half> <half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000, half 0xH8000>, i8 1, i32 8)
+  ret <8 x half> %r1
+}
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 4fc0827ac4dd..33381313d3c1 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -146,37 +146,40 @@ define i64 @bitselect_i64(i64 %a, i64 %b, i64 %m) nounwind {
 define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ; X86-LABEL: bitselect_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorl %edi, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %ebx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %esi, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    andl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    andl 56(%ebp), %ecx
+; X86-NEXT:    xorl %esi, %ecx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    andl 60(%ebp), %esi
+; X86-NEXT:    xorl %edi, %esi
+; X86-NEXT:    movl 48(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    andl 64(%ebp), %edi
+; X86-NEXT:    xorl %edx, %edi
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    andl 68(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-NOBMI-LABEL: bitselect_i128:
diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
index 312f94c04123..143e10e6909e 100644
--- a/llvm/test/CodeGen/X86/bsf.ll
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -263,70 +263,78 @@ define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebp, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ebx, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
 ; X86-NEXT:    testl %ecx, %ecx
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    rep bsfl %edi, %esi
-; X86-NEXT:    addl $32, %esi
-; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    je .LBB8_7
-; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    rep bsfl %eax, %edx
-; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:    rep bsfl %esi, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    jmp .LBB8_5
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl $128, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    rep bsfl %ecx, %esi
-; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:  .LBB8_5: # %cond.false
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB8_6
-; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    rep bsfl %ebp, %edx
+; X86-NEXT:  # %bb.7: # %cond.false
+; X86-NEXT:    rep bsfl %ebx, %edx
 ; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    rep bsfl %edi, %edx
 ; X86-NEXT:  .LBB8_8: # %cond.false
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  # %bb.9: # %cond.false
 ; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    xorl %ebx, %ebx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    orl %ecx, %edi
-; X86-NEXT:    jne .LBB8_13
-; X86-NEXT:  # %bb.12:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:  .LBB8_13: # %cond.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, 12(%eax)
-; X86-NEXT:    movl %ebx, 8(%eax)
-; X86-NEXT:    movl %ebp, 4(%eax)
-; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB8_12
+; X86-NEXT:  # %bb.13: # %cond.end
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB8_14
+; X86-NEXT:  .LBB8_12:
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:  .LBB8_14: # %cond.end
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -361,46 +369,49 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsf128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %ecx, %edi
-; X86-NEXT:    orl %esi, %edi
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    orl %ebx, %ebp
-; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    je .LBB9_11
 ; X86-NEXT:  # %bb.1: # %select.true.sink
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_2
 ; X86-NEXT:  # %bb.3: # %select.true.sink
-; X86-NEXT:    rep bsfl %ecx, %edi
-; X86-NEXT:    addl $32, %edi
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    rep bsfl %ecx, %ebx
+; X86-NEXT:    addl $32, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    je .LBB9_6
 ; X86-NEXT:  .LBB9_5:
-; X86-NEXT:    rep bsfl %ebx, %esi
+; X86-NEXT:    rep bsfl %edi, %esi
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    je .LBB9_8
 ; X86-NEXT:    jmp .LBB9_9
 ; X86-NEXT:  .LBB9_11: # %select.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 52(%ebp), %ecx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    movl 40(%ebp), %edi
 ; X86-NEXT:    movl %edi, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
 ; X86-NEXT:    jmp .LBB9_10
 ; X86-NEXT:  .LBB9_2:
-; X86-NEXT:    rep bsfl %edx, %edi
-; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    rep bsfl %edx, %ebx
+; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB9_5
 ; X86-NEXT:  .LBB9_6: # %select.true.sink
 ; X86-NEXT:    rep bsfl %esi, %esi
@@ -409,13 +420,14 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    jne .LBB9_9
 ; X86-NEXT:  .LBB9_8: # %select.true.sink
 ; X86-NEXT:    addl $64, %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movl %esi, %ebx
 ; X86-NEXT:  .LBB9_9: # %select.true.sink
-; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %ebx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
 ; X86-NEXT:    movl $0, 4(%eax)
 ; X86-NEXT:  .LBB9_10: # %select.true.sink
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
index fbca4af425ea..ab0478a4e944 100644
--- a/llvm/test/CodeGen/X86/bsr.ll
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -291,79 +291,80 @@ define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    orl %ebp, %edx
-; X86-NEXT:    movl %ecx, %esi
-; X86-NEXT:    orl %ebx, %esi
-; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    orl %esi, %eax
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    je .LBB8_1
 ; X86-NEXT:  # %bb.2: # %cond.false
-; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    jne .LBB8_3
 ; X86-NEXT:  # %bb.4: # %cond.false
-; X86-NEXT:    bsrl %ebx, %edx
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    bsrl %ebx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    je .LBB8_7
 ; X86-NEXT:  .LBB8_6:
-; X86-NEXT:    bsrl %edi, %esi
-; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    bsrl %edi, %eax
+; X86-NEXT:    xorl $31, %eax
 ; X86-NEXT:    jmp .LBB8_8
 ; X86-NEXT:  .LBB8_1:
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
-; X86-NEXT:    movl $128, %edx
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    movl $128, %esi
 ; X86-NEXT:    jmp .LBB8_11
 ; X86-NEXT:  .LBB8_3:
-; X86-NEXT:    bsrl %ebp, %edx
-; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl %esi, %esi
+; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    jne .LBB8_6
 ; X86-NEXT:  .LBB8_7: # %cond.false
-; X86-NEXT:    bsrl %ecx, %esi
-; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    orl $32, %esi
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:  .LBB8_8: # %cond.false
-; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    orl 36(%ebp), %edx
 ; X86-NEXT:    jne .LBB8_10
 ; X86-NEXT:  # %bb.9: # %cond.false
-; X86-NEXT:    orl $64, %esi
-; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:  .LBB8_10: # %cond.false
-; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    xorl %eax, %eax
 ; X86-NEXT:  .LBB8_11: # %cond.end
-; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl 32(%ebp), %ecx
+; X86-NEXT:    orl 36(%ebp), %edi
 ; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    je .LBB8_12
 ; X86-NEXT:  # %bb.13: # %cond.end
-; X86-NEXT:    xorl $127, %edx
-; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    xorl $127, %esi
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    jmp .LBB8_14
 ; X86-NEXT:  .LBB8_12:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    movl 48(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %esi
 ; X86-NEXT:  .LBB8_14: # %cond.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebx, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
-; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -398,62 +399,67 @@ define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: cmov_bsr128_undef:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    testl %eax, %eax
 ; X86-NEXT:    jne .LBB9_1
 ; X86-NEXT:  # %bb.2:
-; X86-NEXT:    bsrl %esi, %ecx
-; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
 ; X86-NEXT:    jmp .LBB9_3
 ; X86-NEXT:  .LBB9_1:
-; X86-NEXT:    bsrl %edi, %ecx
-; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    bsrl %eax, %esi
+; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:  .LBB9_3:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    testl %edx, %edx
 ; X86-NEXT:    jne .LBB9_4
 ; X86-NEXT:  # %bb.5:
-; X86-NEXT:    bsrl %ebx, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:    orl $32, %ebp
-; X86-NEXT:    jmp .LBB9_6
+; X86-NEXT:    bsrl %ebx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    je .LBB9_7
+; X86-NEXT:    jmp .LBB9_8
 ; X86-NEXT:  .LBB9_4:
-; X86-NEXT:    bsrl %edx, %ebp
-; X86-NEXT:    xorl $31, %ebp
-; X86-NEXT:  .LBB9_6:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    bsrl %edx, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl %eax, %edi
 ; X86-NEXT:    jne .LBB9_8
-; X86-NEXT:  # %bb.7:
-; X86-NEXT:    orl $64, %ebp
-; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:  .LBB9_7:
+; X86-NEXT:    orl $64, %ecx
+; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:  .LBB9_8:
-; X86-NEXT:    orl %edi, %edx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    orl 32(%ebp), %ebx
 ; X86-NEXT:    orl %edx, %ebx
 ; X86-NEXT:    jne .LBB9_9
 ; X86-NEXT:  # %bb.10:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl 52(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %ecx
 ; X86-NEXT:    jmp .LBB9_11
 ; X86-NEXT:  .LBB9_9:
-; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    xorl $127, %esi
+; X86-NEXT:    xorl %ecx, %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    xorl %esi, %esi
 ; X86-NEXT:    xorl %edi, %edi
 ; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
-; X86-NEXT:    movl %edx, 4(%eax)
-; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/bswap-wide-int.ll b/llvm/test/CodeGen/X86/bswap-wide-int.ll
index 6d5e995a6d57..673b7f16de75 100644
--- a/llvm/test/CodeGen/X86/bswap-wide-int.ll
+++ b/llvm/test/CodeGen/X86/bswap-wide-int.ll
@@ -41,13 +41,16 @@ define i64 @bswap_i64(i64 %a0) nounwind {
 define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-LABEL: bswap_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    bswapl %edi
 ; X86-NEXT:    bswapl %esi
 ; X86-NEXT:    bswapl %edx
@@ -56,25 +59,32 @@ define i128 @bswap_i128(i128 %a0) nounwind {
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X86-MOVBE-LABEL: bswap_i128:
 ; X86-MOVBE:       # %bb.0:
+; X86-MOVBE-NEXT:    pushl %ebp
+; X86-MOVBE-NEXT:    movl %esp, %ebp
 ; X86-MOVBE-NEXT:    pushl %edi
 ; X86-MOVBE-NEXT:    pushl %esi
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-MOVBE-NEXT:    andl $-16, %esp
+; X86-MOVBE-NEXT:    movl 8(%ebp), %eax
+; X86-MOVBE-NEXT:    movl 32(%ebp), %ecx
+; X86-MOVBE-NEXT:    movl 36(%ebp), %edx
+; X86-MOVBE-NEXT:    movl 24(%ebp), %esi
+; X86-MOVBE-NEXT:    movl 28(%ebp), %edi
 ; X86-MOVBE-NEXT:    movbel %esi, 12(%eax)
 ; X86-MOVBE-NEXT:    movbel %edi, 8(%eax)
 ; X86-MOVBE-NEXT:    movbel %ecx, 4(%eax)
 ; X86-MOVBE-NEXT:    movbel %edx, (%eax)
+; X86-MOVBE-NEXT:    leal -8(%ebp), %esp
 ; X86-MOVBE-NEXT:    popl %esi
 ; X86-MOVBE-NEXT:    popl %edi
+; X86-MOVBE-NEXT:    popl %ebp
 ; X86-MOVBE-NEXT:    retl $4
 ;
 ; X64-LABEL: bswap_i128:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index d869f8ec01a5..455b72d16a07 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -152,17 +152,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $176, %esp
-; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    xorl %eax, %edx
 ; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edx
 ; X86-NEXT:    xorl %eax, %edx
-; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    xorl %eax, %ecx
 ; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -172,16 +172,15 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 48(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %ebx
 ; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %ebx
@@ -204,45 +203,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
-; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    bsrl %eax, %ecx
 ; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    bsrl %edi, %edi
 ; X86-NEXT:    xorl $31, %edi
-; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    orl $32, %edi
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %edi
-; X86-NEXT:    addl $64, %edi
+; X86-NEXT:    orl $64, %edi
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %edi
-; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    bsrl %eax, %ecx
+; X86-NEXT:    bsrl %eax, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %eax, %eax
-; X86-NEXT:    cmovel %edx, %ecx
+; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    bsrl %ebx, %esi
 ; X86-NEXT:    xorl $31, %esi
 ; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    orl $64, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -380,9 +379,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    adcl $-1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    adcl $-1, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -488,13 +487,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sbbl %ecx, %ebx
 ; X86-NEXT:    sbbl %ecx, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 44(%ebp), %ecx
+; X86-NEXT:    movl 56(%ebp), %ecx
 ; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    movl %eax, 4(%ecx)
 ; X86-NEXT:    movl %ebx, 8(%ecx)
 ; X86-NEXT:    movl %esi, 12(%ecx)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    movl %ebx, %edi
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -508,7 +507,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    adcl $0, %ebx
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %esi
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -523,17 +522,17 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl %eax, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    imull %eax, %ebx
 ; X86-NEXT:    mull %edi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    imull %esi, %edi
 ; X86-NEXT:    addl %edx, %edi
 ; X86-NEXT:    addl %ebx, %edi
-; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    imull %edx, %ebx
 ; X86-NEXT:    mull %edx
@@ -543,13 +542,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
 ; X86-NEXT:    sbbl %ebx, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 7bbddefd8272..adcfee5959af 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -152,60 +152,60 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
 ; X86-NEXT:    subl $160, %esp
-; X86-NEXT:    movl 28(%ebp), %ebx
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    orl %esi, %eax
 ; X86-NEXT:    movl %ebx, %ecx
-; X86-NEXT:    orl 36(%ebp), %ecx
+; X86-NEXT:    orl 48(%ebp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    sete %cl
-; X86-NEXT:    movl 16(%ebp), %eax
-; X86-NEXT:    orl 24(%ebp), %eax
-; X86-NEXT:    movl 12(%ebp), %edx
-; X86-NEXT:    orl 20(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    orl 36(%ebp), %eax
+; X86-NEXT:    movl 24(%ebp), %edx
+; X86-NEXT:    orl 32(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    orb %cl, %al
 ; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    bsrl %esi, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    bsrl 36(%ebp), %ecx
+; X86-NEXT:    bsrl 48(%ebp), %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %esi, %esi
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    bsrl %edi, %edx
 ; X86-NEXT:    xorl $31, %edx
 ; X86-NEXT:    bsrl %ebx, %eax
 ; X86-NEXT:    xorl $31, %eax
-; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    orl $32, %eax
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %edx, %eax
-; X86-NEXT:    addl $64, %eax
-; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    orl $64, %eax
+; X86-NEXT:    movl 48(%ebp), %edx
 ; X86-NEXT:    orl %esi, %edx
 ; X86-NEXT:    cmovnel %ecx, %eax
-; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    bsrl %ebx, %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    bsrl %ecx, %ecx
 ; X86-NEXT:    xorl $31, %ecx
-; X86-NEXT:    addl $32, %ecx
+; X86-NEXT:    orl $32, %ecx
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    cmovnel %edx, %ecx
-; X86-NEXT:    movl 16(%ebp), %edi
+; X86-NEXT:    movl 28(%ebp), %edi
 ; X86-NEXT:    bsrl %edi, %esi
 ; X86-NEXT:    xorl $31, %esi
-; X86-NEXT:    bsrl 12(%ebp), %edx
+; X86-NEXT:    bsrl 24(%ebp), %edx
 ; X86-NEXT:    xorl $31, %edx
-; X86-NEXT:    addl $32, %edx
+; X86-NEXT:    orl $32, %edx
 ; X86-NEXT:    testl %edi, %edi
 ; X86-NEXT:    cmovnel %esi, %edx
-; X86-NEXT:    addl $64, %edx
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    orl $64, %edx
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    orl %ebx, %esi
 ; X86-NEXT:    cmovnel %ecx, %edx
@@ -237,30 +237,30 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    sete %al
 ; X86-NEXT:    testb %cl, %cl
 ; X86-NEXT:    movb %cl, %ah
-; X86-NEXT:    movl 24(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
 ; X86-NEXT:    movl $0, %esi
 ; X86-NEXT:    cmovnel %esi, %ebx
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    cmovnel %esi, %ecx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    cmovnel %edx, %esi
-; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    movl %edi, %ecx
 ; X86-NEXT:    cmovnel %edx, %ecx
 ; X86-NEXT:    orb %ah, %al
-; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %eax
 ; X86-NEXT:    jne .LBB4_7
 ; X86-NEXT:  # %bb.1: # %udiv-bb1
 ; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    xorps %xmm0, %xmm0
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    movl %edx, %ecx
@@ -289,7 +289,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    adcl $0, %esi
 ; X86-NEXT:    adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT:    movl 20(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %ebx
 ; X86-NEXT:    jae .LBB4_2
 ; X86-NEXT:  # %bb.5:
 ; X86-NEXT:    xorl %edx, %edx
@@ -299,13 +299,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:  .LBB4_2: # %udiv-preheader
 ; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edx
 ; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
@@ -334,16 +334,16 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrdl %cl, %eax, %edi
 ; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    addl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl 44(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    adcl $-1, %esi
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl 52(%ebp), %eax
 ; X86-NEXT:    adcl $-1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    xorl %eax, %eax
@@ -391,13 +391,13 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    andl $1, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 40(%ebp), %eax
+; X86-NEXT:    andl 52(%ebp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    andl 36(%ebp), %eax
+; X86-NEXT:    andl 48(%ebp), %eax
 ; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    andl 32(%ebp), %edx
-; X86-NEXT:    andl 28(%ebp), %ecx
+; X86-NEXT:    andl 44(%ebp), %edx
+; X86-NEXT:    andl 40(%ebp), %ecx
 ; X86-NEXT:    subl %ecx, %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
@@ -437,7 +437,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    orl %eax, %esi
 ; X86-NEXT:    addl %ecx, %ecx
 ; X86-NEXT:    orl %edx, %ecx
-; X86-NEXT:    movl 44(%ebp), %eax
+; X86-NEXT:    movl 56(%ebp), %eax
 ; X86-NEXT:  .LBB4_7: # %udiv-end
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
@@ -446,23 +446,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %ebx, 12(%eax)
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %eax
 ; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 52(%ebp), %edi
 ; X86-NEXT:    imull %ecx, %edi
 ; X86-NEXT:    addl %edx, %edi
-; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 40(%ebp), %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    mull %esi
 ; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    imull 28(%ebp), %ebx
+; X86-NEXT:    imull 40(%ebp), %ebx
 ; X86-NEXT:    addl %edx, %ebx
-; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 44(%ebp), %edx
 ; X86-NEXT:    imull %edx, %esi
 ; X86-NEXT:    addl %ebx, %esi
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
@@ -471,7 +471,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %ecx
 ; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %esi
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -483,26 +483,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    mull 32(%ebp)
-; X86-NEXT:    movl 16(%ebp), %esi
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    setb %cl
 ; X86-NEXT:    movl %ebx, %eax
-; X86-NEXT:    mull 32(%ebp)
+; X86-NEXT:    mull 44(%ebp)
 ; X86-NEXT:    addl %edi, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
 ; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT:    movl 12(%ebp), %ebx
+; X86-NEXT:    movl 24(%ebp), %ebx
 ; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    movl 20(%ebp), %edi
+; X86-NEXT:    movl 32(%ebp), %edi
 ; X86-NEXT:    sbbl %eax, %edi
-; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ebx, (%eax)
diff --git a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
similarity index 99%
rename from llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
rename to llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
index 17de405928d3..3810e227bea9 100644
--- a/llvm/test/CodeGen/X86/tail-dup-computed-goto.mir
+++ b/llvm/test/CodeGen/X86/early-tail-dup-computed-goto.mir
@@ -1,6 +1,8 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 # RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=early-tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
-# Check that only the computed goto is not be restrict by tail-dup-pred-size and tail-dup-succ-size.
+#
+# Check that only the computed goto and others are restricted by tail-dup-pred-size and tail-dup-succ-size.
+#
 --- |
   @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
   declare i64 @f0()
diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
index 707b05f3478d..bb5640aeb66f 100644
--- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll
@@ -481,18 +481,21 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixtfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -501,7 +504,7 @@ define i128 @fptosi_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -620,18 +623,21 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunstfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -640,7 +646,7 @@ define i128 @fptoui_i128(fp128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -818,18 +824,21 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floattitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -838,7 +847,7 @@ define fp128 @sitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
@@ -1016,18 +1025,21 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp {
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floatuntitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1036,7 +1048,7 @@ define fp128 @uitofp_i128(i128 %x) nounwind strictfp {
 ; X86-NEXT:    movl %eax, (%esi)
 ; X86-NEXT:    movl %ecx, 4(%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
diff --git a/llvm/test/CodeGen/X86/fp128-cast.ll b/llvm/test/CodeGen/X86/fp128-cast.ll
index 1de2484d47ba..6d4ec063ccd4 100644
--- a/llvm/test/CodeGen/X86/fp128-cast.ll
+++ b/llvm/test/CodeGen/X86/fp128-cast.ll
@@ -415,16 +415,20 @@ define dso_local void @TestFPToSIF128_I128() nounwind {
 ; X86-LABEL: TestFPToSIF128_I128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl vf128+4, %ecx
+; X86-NEXT:    movl vf128+8, %edx
+; X86-NEXT:    movl vf128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixtfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -432,7 +436,7 @@ define dso_local void @TestFPToSIF128_I128() nounwind {
 ; X86-NEXT:    movl %edx, vi128+8
 ; X86-NEXT:    movl %ecx, vi128+4
 ; X86-NEXT:    movl %eax, vi128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -466,16 +470,20 @@ define dso_local void @TestFPToUIF128_U128() nounwind {
 ; X86-LABEL: TestFPToUIF128_U128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vf128, %eax
+; X86-NEXT:    movl vf128+4, %ecx
+; X86-NEXT:    movl vf128+8, %edx
+; X86-NEXT:    movl vf128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __fixunstfti
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -483,7 +491,7 @@ define dso_local void @TestFPToUIF128_U128() nounwind {
 ; X86-NEXT:    movl %edx, vu128+8
 ; X86-NEXT:    movl %ecx, vu128+4
 ; X86-NEXT:    movl %eax, vu128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -913,16 +921,20 @@ define dso_local void @TestSIToFPI128_F128() nounwind {
 ; X86-LABEL: TestSIToFPI128_F128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vi128, %eax
+; X86-NEXT:    movl vi128+4, %ecx
+; X86-NEXT:    movl vi128+8, %edx
+; X86-NEXT:    movl vi128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vi128+12
-; X86-NEXT:    pushl vi128+8
-; X86-NEXT:    pushl vi128+4
-; X86-NEXT:    pushl vi128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floattitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -930,7 +942,7 @@ define dso_local void @TestSIToFPI128_F128() nounwind {
 ; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -964,16 +976,20 @@ define dso_local void @TestUIToFPU128_F128() #2 {
 ; X86-LABEL: TestUIToFPU128_F128:
 ; X86:       # %bb.0: # %entry
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $36, %esp
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl vu128, %eax
+; X86-NEXT:    movl vu128+4, %ecx
+; X86-NEXT:    movl vu128+8, %edx
+; X86-NEXT:    movl vu128+12, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl vu128+12
-; X86-NEXT:    pushl vu128+8
-; X86-NEXT:    pushl vu128+4
-; X86-NEXT:    pushl vu128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __floatuntitf
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -981,7 +997,7 @@ define dso_local void @TestUIToFPU128_F128() #2 {
 ; X86-NEXT:    movl %edx, vf128+8
 ; X86-NEXT:    movl %ecx, vf128+4
 ; X86-NEXT:    movl %eax, vf128
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $56, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
@@ -1134,33 +1150,30 @@ define dso_local i32 @TestBits128(fp128 %ld) nounwind {
 ;
 ; X86-LABEL: TestBits128:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $20, %esp
+; X86-NEXT:    subl $72, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    subl $12, %esp
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    orl (%esp), %ecx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    sete %al
-; X86-NEXT:    addl $20, %esp
+; X86-NEXT:    addl $72, %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: TestBits128:
@@ -1359,12 +1372,14 @@ define i1 @PR34866(i128 %x) nounwind {
 ;
 ; X86-LABEL: PR34866:
 ; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: PR34866:
@@ -1394,12 +1409,14 @@ define i1 @PR34866_commute(i128 %x) nounwind {
 ;
 ; X86-LABEL: PR34866_commute:
 ; X86:       # %bb.0:
+; X86-NEXT:    subl $12, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl
 ;
 ; X64-AVX-LABEL: PR34866_commute:
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index a7eea04181f6..ad2d690fd7ed 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -41,27 +41,40 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: add:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: add:
@@ -81,24 +94,32 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -107,9 +128,10 @@ define fp128 @add(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -141,27 +163,40 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: sub:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sub:
@@ -181,24 +216,32 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -207,9 +250,10 @@ define fp128 @sub(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -241,27 +285,40 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: mul:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: mul:
@@ -281,24 +338,32 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -307,9 +372,10 @@ define fp128 @mul(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -341,27 +407,40 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: div:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: div:
@@ -381,24 +460,32 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -407,9 +494,10 @@ define fp128 @div(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -434,31 +522,48 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ;
 ; X86-LABEL: fma:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
-; X86-NEXT:    addl $60, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%ebp)
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: fma:
@@ -481,28 +586,40 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $96, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 56(%ebp)
-; WIN-X86-NEXT:    pushl 52(%ebp)
-; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ebx
+; WIN-X86-NEXT:    movl 56(%ebp), %edi
+; WIN-X86-NEXT:    movl 60(%ebp), %edx
+; WIN-X86-NEXT:    movl 64(%ebp), %ecx
+; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmal
-; WIN-X86-NEXT:    addl $52, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -511,9 +628,10 @@ define fp128 @fma(fp128 %x, fp128 %y, fp128 %z) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -538,27 +656,40 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: frem:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: frem:
@@ -578,24 +709,32 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -604,9 +743,10 @@ define fp128 @frem(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -631,23 +771,28 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: ceil:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll ceilf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: ceil:
@@ -667,17 +812,20 @@ define fp128 @ceil(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _ceill
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -713,23 +861,28 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: acos:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll acosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: acos:
@@ -749,17 +902,20 @@ define fp128 @acos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _acosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -795,23 +951,28 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cos:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll cosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cos:
@@ -831,17 +992,20 @@ define fp128 @cos(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _cosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -877,23 +1041,28 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: cosh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll coshf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: cosh:
@@ -913,17 +1082,20 @@ define fp128 @cosh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _coshl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -959,23 +1131,28 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll expf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp:
@@ -995,17 +1172,20 @@ define fp128 @exp(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _expl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1041,23 +1221,28 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: exp2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll exp2f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: exp2:
@@ -1077,17 +1262,20 @@ define fp128 @exp2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _exp2l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1123,23 +1311,28 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: floor:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll floorf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: floor:
@@ -1159,17 +1352,20 @@ define fp128 @floor(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _floorl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1205,23 +1401,28 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll logf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log:
@@ -1241,17 +1442,20 @@ define fp128 @log(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _logl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1287,23 +1491,28 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log10:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll log10f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log10:
@@ -1323,17 +1532,20 @@ define fp128 @log10(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _log10l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1369,23 +1581,28 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: log2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll log2f128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: log2:
@@ -1405,17 +1622,20 @@ define fp128 @log2(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _log2l
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1451,27 +1671,40 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: maxnum:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaxf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: maxnum:
@@ -1491,24 +1724,32 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmaxl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1517,9 +1758,10 @@ define fp128 @maxnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1544,27 +1786,40 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: minnum:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fminf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: minnum:
@@ -1584,24 +1839,32 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fminl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1610,9 +1873,10 @@ define fp128 @minnum(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1637,23 +1901,28 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: nearbyint:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll nearbyintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: nearbyint:
@@ -1673,17 +1942,20 @@ define fp128 @nearbyint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _nearbyintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1719,27 +1991,40 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: pow:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll powf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: pow:
@@ -1759,24 +2044,32 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _powl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1785,9 +2078,10 @@ define fp128 @pow(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1819,24 +2113,32 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ;
 ; X86-LABEL: powi:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $64, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __powitf2
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $64, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: powi:
@@ -1853,21 +2155,26 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___powitf2
-; WIN-X86-NEXT:    addl $24, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1876,9 +2183,10 @@ define fp128 @powi(fp128 %x, i32 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1903,23 +2211,28 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: rint:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll rintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: rint:
@@ -1939,17 +2252,20 @@ define fp128 @rint(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _rintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1985,23 +2301,28 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: round:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: round:
@@ -2021,17 +2342,20 @@ define fp128 @round(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2067,23 +2391,28 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: roundeven:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundevenf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: roundeven:
@@ -2103,17 +2432,20 @@ define fp128 @roundeven(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundevenl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2149,23 +2481,28 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: asin:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll asinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: asin:
@@ -2185,17 +2522,20 @@ define fp128 @asin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _asinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2231,23 +2571,28 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sin:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sin:
@@ -2267,17 +2612,20 @@ define fp128 @sin(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2313,23 +2661,28 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sinh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sinh:
@@ -2349,17 +2702,20 @@ define fp128 @sinh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2395,23 +2751,28 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: sqrt:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sqrtf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: sqrt:
@@ -2431,17 +2792,20 @@ define fp128 @sqrt(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sqrtl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2477,23 +2841,28 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: atan:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan:
@@ -2513,17 +2882,20 @@ define fp128 @atan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2559,27 +2931,40 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ;
 ; X86-LABEL: atan2:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: atan2:
@@ -2599,24 +2984,32 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atan2l
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2625,9 +3018,10 @@ define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp {
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -2652,23 +3046,28 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tan:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tan:
@@ -2688,17 +3087,20 @@ define fp128 @tan(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2734,23 +3136,28 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: tanh:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: tanh:
@@ -2770,17 +3177,20 @@ define fp128 @tanh(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2816,23 +3226,28 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ;
 ; X86-LABEL: trunc:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll truncf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: trunc:
@@ -2852,17 +3267,20 @@ define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _truncl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2919,12 +3337,18 @@ define i32 @lrint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: lrint:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _lrintl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %rint = call i32 @llvm.experimental.constrained.lrint.i32.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -2969,12 +3393,18 @@ define i64 @llrint(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: llrint:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _llrintl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %rint = call i64 @llvm.experimental.constrained.llrint.i64.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
@@ -3019,12 +3449,18 @@ define i32 @lround(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: lround:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _lroundl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %round = call i32 @llvm.experimental.constrained.lround.i32.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -3069,12 +3505,18 @@ define i64 @llround(fp128 %x) nounwind strictfp {
 ;
 ; WIN-X86-LABEL: llround:
 ; WIN-X86:       # %bb.0: # %entry
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 20(%ebp)
+; WIN-X86-NEXT:    pushl 16(%ebp)
+; WIN-X86-NEXT:    pushl 12(%ebp)
+; WIN-X86-NEXT:    pushl 8(%ebp)
 ; WIN-X86-NEXT:    calll _llroundl
 ; WIN-X86-NEXT:    addl $16, %esp
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
   %round = call i64 @llvm.experimental.constrained.llround.i64.f128(fp128 %x, metadata !"fpexcept.strict") #0
@@ -3176,26 +3618,32 @@ define i64 @cmp(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; WIN-X86-LABEL: cmp:
 ; WIN-X86:       # %bb.0:
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    je LBB37_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB37_3
 ; WIN-X86-NEXT:  LBB37_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB37_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmp.f128(
                                                fp128 %x, fp128 %y,
@@ -3300,26 +3748,32 @@ define i64 @cmps(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ;
 ; WIN-X86-LABEL: cmps:
 ; WIN-X86:       # %bb.0:
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
+; WIN-X86-NEXT:    pushl 36(%ebp)
+; WIN-X86-NEXT:    pushl 32(%ebp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    je LBB38_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB38_3
 ; WIN-X86-NEXT:  LBB38_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB38_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    movl %ebp, %esp
+; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %cond = call i1 @llvm.experimental.constrained.fcmps.f128(
                                                fp128 %x, fp128 %y,
@@ -3496,44 +3950,47 @@ define i64 @cmp_ueq_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-LABEL: cmp_ueq_q:
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    sete %bl
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    orb %bl, %al
 ; WIN-X86-NEXT:    jne LBB39_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB39_3
 ; WIN-X86-NEXT:  LBB39_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB39_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
 ; WIN-X86-NEXT:    popl %ebx
@@ -3716,32 +4173,34 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-LABEL: cmp_one_q:
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
+; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    andl $-16, %esp
+; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    movl 32(%ebp), %edi
+; WIN-X86-NEXT:    movl 36(%ebp), %esi
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___eqtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
 ; WIN-X86-NEXT:    setne %bl
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 52(%ebp)
+; WIN-X86-NEXT:    pushl 48(%ebp)
+; WIN-X86-NEXT:    pushl 44(%ebp)
+; WIN-X86-NEXT:    pushl 40(%ebp)
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    pushl %edi
-; WIN-X86-NEXT:    pushl %ebp
-; WIN-X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    pushl 28(%ebp)
+; WIN-X86-NEXT:    pushl 24(%ebp)
 ; WIN-X86-NEXT:    calll ___unordtf2
 ; WIN-X86-NEXT:    addl $32, %esp
 ; WIN-X86-NEXT:    testl %eax, %eax
@@ -3749,13 +4208,14 @@ define i64 @cmp_one_q(i64 %a, i64 %b, fp128 %x, fp128 %y) #0 {
 ; WIN-X86-NEXT:    testb %bl, %al
 ; WIN-X86-NEXT:    jne LBB40_1
 ; WIN-X86-NEXT:  # %bb.2:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 16(%ebp), %ecx
 ; WIN-X86-NEXT:    jmp LBB40_3
 ; WIN-X86-NEXT:  LBB40_1:
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; WIN-X86-NEXT:    leal 8(%ebp), %ecx
 ; WIN-X86-NEXT:  LBB40_3:
 ; WIN-X86-NEXT:    movl (%ecx), %eax
 ; WIN-X86-NEXT:    movl 4(%ecx), %edx
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
 ; WIN-X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls.ll b/llvm/test/CodeGen/X86/fp128-libcalls.ll
index f727a7907862..4b0449fd7502 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls.ll
@@ -42,22 +42,38 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Add:
@@ -78,22 +94,31 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -101,8 +126,10 @@ define dso_local void @Test128Add(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -144,22 +171,38 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Add:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __addtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Add:
@@ -180,22 +223,31 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___addtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -203,8 +255,10 @@ define dso_local void @Test128_1Add(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -241,22 +295,38 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sub:
@@ -277,22 +347,31 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -300,8 +379,10 @@ define dso_local void @Test128Sub(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -343,22 +424,38 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Sub:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __subtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Sub:
@@ -379,22 +476,31 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___subtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -402,8 +508,10 @@ define dso_local void @Test128_1Sub(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -440,22 +548,38 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Mul:
@@ -476,22 +600,31 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -499,8 +632,10 @@ define dso_local void @Test128Mul(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -542,22 +677,38 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Mul:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __multf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Mul:
@@ -578,22 +729,31 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___multf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -601,8 +761,10 @@ define dso_local void @Test128_1Mul(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -639,22 +801,38 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Div:
@@ -675,22 +853,31 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -698,8 +885,10 @@ define dso_local void @Test128Div(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -741,22 +930,38 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Div:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll __divtf3
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Div:
@@ -777,22 +982,31 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll ___divtf3
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -800,8 +1014,10 @@ define dso_local void @Test128_1Div(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -830,22 +1046,38 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ;
 ; X86-LABEL: Test128Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rem:
@@ -866,22 +1098,31 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %edi
+; WIN-X86-NEXT:    movl 28(%ebp), %ebx
+; WIN-X86-NEXT:    movl 32(%ebp), %ecx
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -889,8 +1130,10 @@ define dso_local void @Test128Rem(fp128 %d1, fp128 %d2) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+8
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
 ; WIN-X86-NEXT:    movl %eax, _vf128
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -922,22 +1165,38 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128_1Rem:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $76, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl vf128, %edi
+; X86-NEXT:    movl vf128+4, %ebx
+; X86-NEXT:    movl vf128+8, %ebp
+; X86-NEXT:    movl vf128+12, %eax
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl vf128+12
-; X86-NEXT:    pushl vf128+8
-; X86-NEXT:    pushl vf128+4
-; X86-NEXT:    pushl vf128
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmodf128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $76, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128_1Rem:
@@ -958,22 +1217,31 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
+; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl _vf128+12
-; WIN-X86-NEXT:    pushl _vf128+8
-; WIN-X86-NEXT:    pushl _vf128+4
-; WIN-X86-NEXT:    pushl _vf128
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $80, %esp
+; WIN-X86-NEXT:    movl 16(%ebp), %esi
+; WIN-X86-NEXT:    movl 20(%ebp), %edi
+; WIN-X86-NEXT:    movl _vf128, %edx
+; WIN-X86-NEXT:    movl _vf128+4, %ebx
+; WIN-X86-NEXT:    movl _vf128+8, %ecx
+; WIN-X86-NEXT:    movl _vf128+12, %eax
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 12(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmodl
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -981,8 +1249,10 @@ define dso_local void @Test128_1Rem(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %edx, _vf128+12
 ; WIN-X86-NEXT:    movl %eax, _vf128
 ; WIN-X86-NEXT:    movl %ecx, _vf128+4
-; WIN-X86-NEXT:    leal -4(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
+; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1011,18 +1281,24 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sqrt:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sqrtf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sqrt:
@@ -1042,16 +1318,19 @@ define dso_local void @Test128Sqrt(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sqrtl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1089,18 +1368,24 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Sin:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Sin:
@@ -1120,16 +1405,19 @@ define dso_local void @Test128Sin(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1167,18 +1455,24 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Cos:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll cosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Cos:
@@ -1198,16 +1492,19 @@ define dso_local void @Test128Cos(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _cosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1245,18 +1542,24 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Ceil:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll ceilf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Ceil:
@@ -1276,16 +1579,19 @@ define dso_local void @Test128Ceil(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _ceill
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1323,18 +1629,24 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Floor:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll floorf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Floor:
@@ -1354,16 +1666,19 @@ define dso_local void @Test128Floor(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _floorl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1401,18 +1716,24 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Trunc:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll truncf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Trunc:
@@ -1432,16 +1753,19 @@ define dso_local void @Test128Trunc(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _truncl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1479,18 +1803,24 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Nearbyint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll nearbyintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Nearbyint:
@@ -1510,16 +1840,19 @@ define dso_local void @Test128Nearbyint(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _nearbyintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1557,18 +1890,24 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Rint:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll rintf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Rint:
@@ -1588,16 +1927,19 @@ define dso_local void @Test128Rint(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _rintl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1635,18 +1977,24 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ;
 ; X86-LABEL: Test128Round:
 ; X86:       # %bb.0: # %entry
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $56, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll roundf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, vf128
-; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    addl $56, %esp
+; X86-NEXT:    popl %esi
 ; X86-NEXT:    retl
 ;
 ; WIN-LABEL: Test128Round:
@@ -1666,16 +2014,19 @@ define dso_local void @Test128Round(fp128 %d1) nounwind {
 ; WIN-X86-NEXT:    movl %esp, %ebp
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $32, %esp
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl 8(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    movl 8(%ebp), %eax
+; WIN-X86-NEXT:    movl 12(%ebp), %ecx
+; WIN-X86-NEXT:    movl 16(%ebp), %edx
+; WIN-X86-NEXT:    movl 20(%ebp), %esi
+; WIN-X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _roundl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
@@ -1705,31 +2056,48 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ;
 ; X86-LABEL: Test128FMA:
 ; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $92, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll fmaf128
-; X86-NEXT:    addl $60, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
-; X86-NEXT:    movaps %xmm0, (%esi)
-; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%ebp)
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    addl $92, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128FMA:
@@ -1752,28 +2120,40 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ; WIN-X86:       # %bb.0: # %entry
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $96, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 56(%ebp)
-; WIN-X86-NEXT:    pushl 52(%ebp)
-; WIN-X86-NEXT:    pushl 48(%ebp)
-; WIN-X86-NEXT:    pushl 44(%ebp)
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 52(%ebp), %ebx
+; WIN-X86-NEXT:    movl 56(%ebp), %edi
+; WIN-X86-NEXT:    movl 60(%ebp), %edx
+; WIN-X86-NEXT:    movl 64(%ebp), %ecx
+; WIN-X86-NEXT:    movl 68(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 48(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 44(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 40(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 36(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _fmal
-; WIN-X86-NEXT:    addl $52, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1782,9 +2162,10 @@ define fp128 @Test128FMA(fp128 %a, fp128 %b, fp128 %c) nounwind {
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
 entry:
@@ -1804,23 +2185,28 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Acos:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll acosf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Acos:
@@ -1840,17 +2226,20 @@ define fp128 @Test128Acos(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _acosl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1879,23 +2268,28 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Asin:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll asinf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Asin:
@@ -1915,17 +2309,20 @@ define fp128 @Test128Asin(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _asinl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -1954,23 +2351,28 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Atan:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan:
@@ -1990,17 +2392,20 @@ define fp128 @Test128Atan(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2029,27 +2434,40 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ;
 ; X86-LABEL: Test128Atan2:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $76, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll atan2f128
-; X86-NEXT:    addl $44, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $76, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Atan2:
@@ -2069,24 +2487,32 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ; WIN-X86:       # %bb.0:
 ; WIN-X86-NEXT:    pushl %ebp
 ; WIN-X86-NEXT:    movl %esp, %ebp
+; WIN-X86-NEXT:    pushl %ebx
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $80, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 40(%ebp)
-; WIN-X86-NEXT:    pushl 36(%ebp)
-; WIN-X86-NEXT:    pushl 32(%ebp)
-; WIN-X86-NEXT:    pushl 28(%ebp)
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl 40(%ebp), %ebx
+; WIN-X86-NEXT:    movl 44(%ebp), %edx
+; WIN-X86-NEXT:    movl 48(%ebp), %ecx
+; WIN-X86-NEXT:    movl 52(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 32(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 28(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _atan2l
-; WIN-X86-NEXT:    addl $36, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2095,9 +2521,10 @@ define fp128 @Test128Atan2(fp128 %a, fp128 %b) nounwind {
 ; WIN-X86-NEXT:    movl %ecx, 4(%esi)
 ; WIN-X86-NEXT:    movl %eax, (%esi)
 ; WIN-X86-NEXT:    movl %esi, %eax
-; WIN-X86-NEXT:    leal -8(%ebp), %esp
+; WIN-X86-NEXT:    leal -12(%ebp), %esp
 ; WIN-X86-NEXT:    popl %esi
 ; WIN-X86-NEXT:    popl %edi
+; WIN-X86-NEXT:    popl %ebx
 ; WIN-X86-NEXT:    popl %ebp
 ; WIN-X86-NEXT:    retl
   %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b)
@@ -2115,23 +2542,28 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Cosh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll coshf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Cosh:
@@ -2151,17 +2583,20 @@ define fp128 @Test128Cosh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _coshl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2190,23 +2625,28 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Sinh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll sinhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Sinh:
@@ -2226,17 +2666,20 @@ define fp128 @Test128Sinh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _sinhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2265,23 +2708,28 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tan:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tan:
@@ -2301,17 +2749,20 @@ define fp128 @Test128Tan(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2340,23 +2791,28 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Tanh:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    subl $52, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll tanhf128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $52, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Tanh:
@@ -2376,17 +2832,20 @@ define fp128 @Test128Tanh(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $16, %esp
+; WIN-X86-NEXT:    subl $48, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
-; WIN-X86-NEXT:    movl %esp, %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %eax
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _tanhl
-; WIN-X86-NEXT:    addl $20, %esp
-; WIN-X86-NEXT:    movl (%esp), %eax
+; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
@@ -2425,27 +2884,34 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ;
 ; X86-LABEL: Test128Modf:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    subl $40, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-NEXT:    pushl %ecx
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    calll modff128
-; X86-NEXT:    addl $28, %esp
-; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm0
 ; X86-NEXT:    movaps {{[0-9]+}}(%esp), %xmm1
 ; X86-NEXT:    movaps %xmm1, 16(%esi)
 ; X86-NEXT:    movaps %xmm0, (%esi)
 ; X86-NEXT:    movl %esi, %eax
-; X86-NEXT:    addl $40, %esp
+; X86-NEXT:    addl $80, %esp
 ; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    retl $4
 ;
 ; WIN-LABEL: Test128Modf:
@@ -2468,18 +2934,21 @@ define { fp128, fp128 } @Test128Modf(fp128 %a) nounwind {
 ; WIN-X86-NEXT:    pushl %edi
 ; WIN-X86-NEXT:    pushl %esi
 ; WIN-X86-NEXT:    andl $-16, %esp
-; WIN-X86-NEXT:    subl $64, %esp
+; WIN-X86-NEXT:    subl $112, %esp
 ; WIN-X86-NEXT:    movl 8(%ebp), %esi
+; WIN-X86-NEXT:    movl 24(%ebp), %eax
+; WIN-X86-NEXT:    movl 28(%ebp), %ecx
+; WIN-X86-NEXT:    movl 32(%ebp), %edx
+; WIN-X86-NEXT:    movl 36(%ebp), %edi
+; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; WIN-X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; WIN-X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; WIN-X86-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; WIN-X86-NEXT:    pushl %eax
-; WIN-X86-NEXT:    pushl 24(%ebp)
-; WIN-X86-NEXT:    pushl 20(%ebp)
-; WIN-X86-NEXT:    pushl 16(%ebp)
-; WIN-X86-NEXT:    pushl 12(%ebp)
-; WIN-X86-NEXT:    pushl %ecx
+; WIN-X86-NEXT:    movl %eax, (%esp)
 ; WIN-X86-NEXT:    calll _modfl
-; WIN-X86-NEXT:    addl $24, %esp
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; WIN-X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; WIN-X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll
index 0f66d42697d9..953a5e7285fe 100644
--- a/llvm/test/CodeGen/X86/freeze-vector.ll
+++ b/llvm/test/CodeGen/X86/freeze-vector.ll
@@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    vmovdqa (%ecx), %xmm0
-; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%edx), %xmm0
+; X86-NEXT:    vpand (%ecx), %xmm0, %xmm0
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: freeze_extractelement:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rsi), %xmm0
-; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
   %i0 = load <16 x i8>, ptr %origin0
@@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    vmovdqa (%edx), %xmm0
-; X86-NEXT:    vpand (%esi), %xmm0, %xmm0
+; X86-NEXT:    vmovdqa (%esi), %xmm0
+; X86-NEXT:    vpand (%edx), %xmm0, %xmm0
 ; X86-NEXT:    vmovdqa %xmm0, (%ecx)
 ; X86-NEXT:    vpextrb $6, %xmm0, (%eax)
 ; X86-NEXT:    popl %esi
@@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst,
 ;
 ; X64-LABEL: freeze_extractelement_escape:
 ; X64:       # %bb.0:
-; X64-NEXT:    vmovdqa (%rsi), %xmm0
-; X64-NEXT:    vpand (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovdqa (%rdi), %xmm0
+; X64-NEXT:    vpand (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vmovdqa %xmm0, (%rcx)
 ; X64-NEXT:    vpextrb $6, %xmm0, (%rdx)
 ; X64-NEXT:    retq
@@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    movl 12(%ebp), %esi
 ; X86-NEXT:    movl 8(%ebp), %edi
-; X86-NEXT:    vmovaps (%esi), %xmm0
-; X86-NEXT:    vandps (%edi), %xmm0, %xmm0
+; X86-NEXT:    vmovaps (%edi), %xmm0
+; X86-NEXT:    vandps (%esi), %xmm0, %xmm0
 ; X86-NEXT:    vmovaps %xmm0, (%esp)
 ; X86-NEXT:    movzbl (%esp,%ecx), %ecx
 ; X86-NEXT:    cmpb (%esp,%eax), %cl
@@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id
 ; X64:       # %bb.0:
 ; X64-NEXT:    andl $15, %ecx
 ; X64-NEXT:    andl $15, %edx
-; X64-NEXT:    vmovaps (%rsi), %xmm0
-; X64-NEXT:    vandps (%rdi), %xmm0, %xmm0
+; X64-NEXT:    vmovaps (%rdi), %xmm0
+; X64-NEXT:    vandps (%rsi), %xmm0, %xmm0
 ; X64-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NEXT:    movzbl -24(%rsp,%rdx), %eax
 ; X64-NEXT:    cmpb -24(%rsp,%rcx), %al
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index e8c8ccfa8d37..ec1b8a3c8d6d 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -264,53 +264,62 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i128:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    movl %esp, %ebp
 ; X86-FAST-NEXT:    pushl %ebx
 ; X86-FAST-NEXT:    pushl %edi
 ; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    andl $-16, %esp
+; X86-FAST-NEXT:    subl $16, %esp
+; X86-FAST-NEXT:    movl 24(%ebp), %edi
+; X86-FAST-NEXT:    movl 28(%ebp), %edx
+; X86-FAST-NEXT:    movl 48(%ebp), %esi
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
 ; X86-FAST-NEXT:    testb $64, %cl
+; X86-FAST-NEXT:    movl 52(%ebp), %eax
 ; X86-FAST-NEXT:    jne .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %ebx, %ebp
 ; X86-FAST-NEXT:    movl %esi, %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl %edi, %eax
-; X86-FAST-NEXT:    movl %edx, %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-FAST-NEXT:    movl %edi, %esi
+; X86-FAST-NEXT:    movl 32(%ebp), %edi
+; X86-FAST-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edx, %eax
+; X86-FAST-NEXT:    movl 36(%ebp), %edx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    je .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %esi, %edx
-; X86-FAST-NEXT:    movl %edi, %esi
-; X86-FAST-NEXT:    movl %ebx, %edi
-; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %esi, %eax
+; X86-FAST-NEXT:    movl (%esp), %esi # 4-byte Reload
 ; X86-FAST-NEXT:    jmp .LBB6_6
 ; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-FAST-NEXT:    movl 44(%ebp), %ebx
+; X86-FAST-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-FAST-NEXT:    movl 40(%ebp), %ebx
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    jne .LBB6_4
 ; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    movl %eax, %ebp
+; X86-FAST-NEXT:    movl (%esp), %ebx # 4-byte Reload
+; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-FAST-NEXT:  .LBB6_6:
-; X86-FAST-NEXT:    movl %ebx, %eax
-; X86-FAST-NEXT:    shldl %cl, %ebp, %eax
-; X86-FAST-NEXT:    movl %edi, %ebp
-; X86-FAST-NEXT:    shldl %cl, %ebx, %ebp
-; X86-FAST-NEXT:    movl %esi, %ebx
-; X86-FAST-NEXT:    shldl %cl, %edi, %ebx
+; X86-FAST-NEXT:    movl %esi, %edi
+; X86-FAST-NEXT:    shldl %cl, %ebx, %edi
+; X86-FAST-NEXT:    movl %eax, %edx
+; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    shldl %cl, %esi, %ebx
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-FAST-NEXT:    movl %eax, %esi
+; X86-FAST-NEXT:    shldl %cl, %edx, %esi
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT:    shldl %cl, %esi, %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-FAST-NEXT:    movl %edx, 12(%ecx)
-; X86-FAST-NEXT:    movl %ebx, 8(%ecx)
-; X86-FAST-NEXT:    movl %ebp, 4(%ecx)
-; X86-FAST-NEXT:    movl %eax, (%ecx)
-; X86-FAST-NEXT:    movl %ecx, %eax
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-FAST-NEXT:    shldl %cl, %eax, %edx
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %edx, 12(%eax)
+; X86-FAST-NEXT:    movl %esi, 8(%eax)
+; X86-FAST-NEXT:    movl %ebx, 4(%eax)
+; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
 ; X86-FAST-NEXT:    popl %ebx
@@ -320,77 +329,91 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-LABEL: var_shift_i128:
 ; X86-SLOW:       # %bb.0:
 ; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    movl %esp, %ebp
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    pushl %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    testb $64, %al
+; X86-SLOW-NEXT:    andl $-16, %esp
+; X86-SLOW-NEXT:    subl $32, %esp
+; X86-SLOW-NEXT:    movl 24(%ebp), %esi
+; X86-SLOW-NEXT:    movl 28(%ebp), %eax
+; X86-SLOW-NEXT:    movl 48(%ebp), %edx
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    testb $64, %cl
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
 ; X86-SLOW-NEXT:    jne .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %edx, %ebp
-; X86-SLOW-NEXT:    movl %ebx, %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %esi, %edx
+; X86-SLOW-NEXT:    movl 32(%ebp), %esi
 ; X86-SLOW-NEXT:    movl %edi, %ecx
-; X86-SLOW-NEXT:    movl %esi, %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    testb $32, %al
-; X86-SLOW-NEXT:    je .LBB6_5
-; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %edi, %ebx
-; X86-SLOW-NEXT:    movl %edx, %edi
-; X86-SLOW-NEXT:    movl %ecx, %edx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    movl %eax, %edi
+; X86-SLOW-NEXT:    movl 36(%ebp), %eax
+; X86-SLOW-NEXT:    jmp .LBB6_3
 ; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    testb $32, %al
+; X86-SLOW-NEXT:    movl 40(%ebp), %ecx
+; X86-SLOW-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 44(%ebp), %ecx
+; X86-SLOW-NEXT:  .LBB6_3:
+; X86-SLOW-NEXT:    movl 56(%ebp), %ebx
+; X86-SLOW-NEXT:    testb $32, %bl
 ; X86-SLOW-NEXT:    jne .LBB6_4
-; X86-SLOW-NEXT:  .LBB6_5:
-; X86-SLOW-NEXT:    movl %ecx, %ebp
-; X86-SLOW-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %ecx, %edx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-SLOW-NEXT:  .LBB6_6:
 ; X86-SLOW-NEXT:    movl %edx, %esi
-; X86-SLOW-NEXT:    movl %eax, %ecx
-; X86-SLOW-NEXT:    shll %cl, %esi
-; X86-SLOW-NEXT:    shrl %ebp
-; X86-SLOW-NEXT:    movb %al, %ch
-; X86-SLOW-NEXT:    notb %ch
-; X86-SLOW-NEXT:    movb %ch, %cl
-; X86-SLOW-NEXT:    shrl %cl, %ebp
-; X86-SLOW-NEXT:    orl %esi, %ebp
-; X86-SLOW-NEXT:    movl %edi, %esi
-; X86-SLOW-NEXT:    movb %al, %cl
-; X86-SLOW-NEXT:    shll %cl, %esi
-; X86-SLOW-NEXT:    shrl %edx
-; X86-SLOW-NEXT:    movb %ch, %cl
-; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    orl %esi, %edx
-; X86-SLOW-NEXT:    movl %ebx, %esi
-; X86-SLOW-NEXT:    movb %al, %cl
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
+; X86-SLOW-NEXT:    movl %ebx, %edi
 ; X86-SLOW-NEXT:    shrl %edi
-; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    movl %ecx, %ebx
+; X86-SLOW-NEXT:    notb %bl
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    orl %esi, %edi
-; X86-SLOW-NEXT:    movb %al, %cl
-; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-SLOW-NEXT:    movl %esi, %eax
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shrl %edx
+; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edx
+; X86-SLOW-NEXT:    orl %eax, %edx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    movl %ebx, %eax
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    shll %cl, %eax
+; X86-SLOW-NEXT:    shrl %esi
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SLOW-NEXT:    shrl %cl, %esi
+; X86-SLOW-NEXT:    orl %eax, %esi
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
+; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    shll %cl, %eax
 ; X86-SLOW-NEXT:    shrl %ebx
-; X86-SLOW-NEXT:    movb %ch, %cl
+; X86-SLOW-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-SLOW-NEXT:    shrl %cl, %ebx
 ; X86-SLOW-NEXT:    orl %eax, %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SLOW-NEXT:    movl 8(%ebp), %eax
 ; X86-SLOW-NEXT:    movl %ebx, 12(%eax)
-; X86-SLOW-NEXT:    movl %edi, 8(%eax)
+; X86-SLOW-NEXT:    movl %esi, 8(%eax)
 ; X86-SLOW-NEXT:    movl %edx, 4(%eax)
-; X86-SLOW-NEXT:    movl %ebp, (%eax)
-; X86-SLOW-NEXT:    addl $4, %esp
+; X86-SLOW-NEXT:    movl %edi, (%eax)
+; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
 ; X86-SLOW-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 4340f8fd484a..544ab7fc7737 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -258,51 +258,53 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-FAST-LABEL: var_shift_i128:
 ; X86-FAST:       # %bb.0:
 ; X86-FAST-NEXT:    pushl %ebp
+; X86-FAST-NEXT:    movl %esp, %ebp
 ; X86-FAST-NEXT:    pushl %ebx
 ; X86-FAST-NEXT:    pushl %edi
 ; X86-FAST-NEXT:    pushl %esi
-; X86-FAST-NEXT:    pushl %eax
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-FAST-NEXT:    andl $-16, %esp
+; X86-FAST-NEXT:    subl $16, %esp
+; X86-FAST-NEXT:    movl 24(%ebp), %esi
+; X86-FAST-NEXT:    movl 28(%ebp), %eax
+; X86-FAST-NEXT:    movl 48(%ebp), %edx
+; X86-FAST-NEXT:    movl 56(%ebp), %ecx
 ; X86-FAST-NEXT:    testb $64, %cl
+; X86-FAST-NEXT:    movl 52(%ebp), %ebx
 ; X86-FAST-NEXT:    je .LBB6_1
 ; X86-FAST-NEXT:  # %bb.2:
-; X86-FAST-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl %edi, %edx
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-FAST-NEXT:    movl %esi, %ebp
-; X86-FAST-NEXT:    movl %ebx, %esi
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-FAST-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl %esi, %edx
+; X86-FAST-NEXT:    movl 32(%ebp), %esi
+; X86-FAST-NEXT:    movl %ebx, %edi
+; X86-FAST-NEXT:    movl %eax, %ebx
+; X86-FAST-NEXT:    movl 36(%ebp), %eax
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    je .LBB6_4
 ; X86-FAST-NEXT:    jmp .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_1:
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-FAST-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-FAST-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-FAST-NEXT:    movl 40(%ebp), %edi
+; X86-FAST-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-FAST-NEXT:    movl 44(%ebp), %edi
 ; X86-FAST-NEXT:    testb $32, %cl
 ; X86-FAST-NEXT:    jne .LBB6_5
 ; X86-FAST-NEXT:  .LBB6_4:
-; X86-FAST-NEXT:    movl %edi, %ebx
-; X86-FAST-NEXT:    movl %esi, %edi
-; X86-FAST-NEXT:    movl %edx, %esi
-; X86-FAST-NEXT:    movl %ebp, %edx
-; X86-FAST-NEXT:    movl (%esp), %ebp # 4-byte Reload
+; X86-FAST-NEXT:    movl %esi, %eax
+; X86-FAST-NEXT:    movl %ebx, %esi
+; X86-FAST-NEXT:    movl %edx, %ebx
+; X86-FAST-NEXT:    movl %edi, %edx
+; X86-FAST-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-FAST-NEXT:  .LBB6_5:
-; X86-FAST-NEXT:    shrdl %cl, %edx, %ebp
-; X86-FAST-NEXT:    shrdl %cl, %esi, %edx
-; X86-FAST-NEXT:    shrdl %cl, %edi, %esi
+; X86-FAST-NEXT:    shrdl %cl, %edx, %edi
+; X86-FAST-NEXT:    shrdl %cl, %ebx, %edx
+; X86-FAST-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-FAST-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-FAST-NEXT:    shrdl %cl, %ebx, %edi
-; X86-FAST-NEXT:    movl %edi, 12(%eax)
-; X86-FAST-NEXT:    movl %esi, 8(%eax)
+; X86-FAST-NEXT:    shrdl %cl, %eax, %esi
+; X86-FAST-NEXT:    movl 8(%ebp), %eax
+; X86-FAST-NEXT:    movl %esi, 12(%eax)
+; X86-FAST-NEXT:    movl %ebx, 8(%eax)
 ; X86-FAST-NEXT:    movl %edx, 4(%eax)
-; X86-FAST-NEXT:    movl %ebp, (%eax)
-; X86-FAST-NEXT:    addl $4, %esp
+; X86-FAST-NEXT:    movl %edi, (%eax)
+; X86-FAST-NEXT:    leal -12(%ebp), %esp
 ; X86-FAST-NEXT:    popl %esi
 ; X86-FAST-NEXT:    popl %edi
 ; X86-FAST-NEXT:    popl %ebx
@@ -312,78 +314,88 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SLOW-LABEL: var_shift_i128:
 ; X86-SLOW:       # %bb.0:
 ; X86-SLOW-NEXT:    pushl %ebp
+; X86-SLOW-NEXT:    movl %esp, %ebp
 ; X86-SLOW-NEXT:    pushl %ebx
 ; X86-SLOW-NEXT:    pushl %edi
 ; X86-SLOW-NEXT:    pushl %esi
-; X86-SLOW-NEXT:    subl $8, %esp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SLOW-NEXT:    testb $64, %cl
+; X86-SLOW-NEXT:    andl $-16, %esp
+; X86-SLOW-NEXT:    subl $16, %esp
+; X86-SLOW-NEXT:    movl 24(%ebp), %edx
+; X86-SLOW-NEXT:    movl 28(%ebp), %esi
+; X86-SLOW-NEXT:    movl 48(%ebp), %ebx
+; X86-SLOW-NEXT:    movl 56(%ebp), %eax
+; X86-SLOW-NEXT:    testb $64, %al
+; X86-SLOW-NEXT:    movl 52(%ebp), %edi
 ; X86-SLOW-NEXT:    je .LBB6_1
 ; X86-SLOW-NEXT:  # %bb.2:
-; X86-SLOW-NEXT:    movl %ebp, %eax
-; X86-SLOW-NEXT:    movl %ebx, %ebp
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SLOW-NEXT:    movl %edi, %edx
+; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, %ebx
+; X86-SLOW-NEXT:    movl 32(%ebp), %edx
+; X86-SLOW-NEXT:    movl %edi, %eax
 ; X86-SLOW-NEXT:    movl %esi, %edi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SLOW-NEXT:    testb $32, %cl
-; X86-SLOW-NEXT:    jne .LBB6_5
-; X86-SLOW-NEXT:  .LBB6_4:
-; X86-SLOW-NEXT:    movl %ebx, %esi
-; X86-SLOW-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT:    movl %ebp, %edi
-; X86-SLOW-NEXT:    movl %edx, %ebp
-; X86-SLOW-NEXT:    movl %eax, %edx
-; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:    movl 36(%ebp), %esi
+; X86-SLOW-NEXT:    jmp .LBB6_3
 ; X86-SLOW-NEXT:  .LBB6_1:
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-SLOW-NEXT:    movl 40(%ebp), %eax
+; X86-SLOW-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 44(%ebp), %eax
+; X86-SLOW-NEXT:  .LBB6_3:
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    testb $32, %cl
 ; X86-SLOW-NEXT:    je .LBB6_4
-; X86-SLOW-NEXT:  .LBB6_5:
-; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:  # %bb.5:
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    jmp .LBB6_6
+; X86-SLOW-NEXT:  .LBB6_4:
+; X86-SLOW-NEXT:    movl %edx, %esi
+; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl %eax, %ebx
+; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:  .LBB6_6:
-; X86-SLOW-NEXT:    shrl %cl, %edx
-; X86-SLOW-NEXT:    movl %ecx, %ebx
-; X86-SLOW-NEXT:    notb %bl
-; X86-SLOW-NEXT:    leal (%ebp,%ebp), %eax
-; X86-SLOW-NEXT:    movl %ebx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %eax
-; X86-SLOW-NEXT:    orl %edx, %eax
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl %cl, %eax
+; X86-SLOW-NEXT:    movl %eax, %edx
+; X86-SLOW-NEXT:    movl %ecx, %eax
+; X86-SLOW-NEXT:    notb %al
+; X86-SLOW-NEXT:    movl %ebx, %edi
+; X86-SLOW-NEXT:    addl %ebx, %ebx
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %ebx
+; X86-SLOW-NEXT:    orl %edx, %ebx
+; X86-SLOW-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %ebp
-; X86-SLOW-NEXT:    leal (%edi,%edi), %edx
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    shrl %cl, %edi
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT:    leal (%ebx,%ebx), %edx
+; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %edx
-; X86-SLOW-NEXT:    orl %ebp, %edx
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    orl %edi, %edx
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT:    shrl %cl, %edi
-; X86-SLOW-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT:    movl (%esp), %edi # 4-byte Reload
-; X86-SLOW-NEXT:    leal (%edi,%edi), %ebp
-; X86-SLOW-NEXT:    movl %ebx, %ecx
-; X86-SLOW-NEXT:    shll %cl, %ebp
-; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    shrl %cl, %ebx
+; X86-SLOW-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-SLOW-NEXT:    leal (%edi,%edi), %ebx
+; X86-SLOW-NEXT:    movl %eax, %ecx
+; X86-SLOW-NEXT:    shll %cl, %ebx
+; X86-SLOW-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-SLOW-NEXT:    movl 56(%ebp), %ecx
 ; X86-SLOW-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SLOW-NEXT:    shrl %cl, %edi
 ; X86-SLOW-NEXT:    addl %esi, %esi
-; X86-SLOW-NEXT:    movl %ebx, %ecx
+; X86-SLOW-NEXT:    movl %eax, %ecx
 ; X86-SLOW-NEXT:    shll %cl, %esi
 ; X86-SLOW-NEXT:    orl %edi, %esi
-; X86-SLOW-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SLOW-NEXT:    movl 8(%ebp), %ecx
 ; X86-SLOW-NEXT:    movl %esi, 12(%ecx)
-; X86-SLOW-NEXT:    movl %ebp, 8(%ecx)
+; X86-SLOW-NEXT:    movl %ebx, 8(%ecx)
 ; X86-SLOW-NEXT:    movl %edx, 4(%ecx)
+; X86-SLOW-NEXT:    movl (%esp), %eax # 4-byte Reload
 ; X86-SLOW-NEXT:    movl %eax, (%ecx)
 ; X86-SLOW-NEXT:    movl %ecx, %eax
-; X86-SLOW-NEXT:    addl $8, %esp
+; X86-SLOW-NEXT:    leal -12(%ebp), %esp
 ; X86-SLOW-NEXT:    popl %esi
 ; X86-SLOW-NEXT:    popl %edi
 ; X86-SLOW-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index a464d78f9af3..df97f49440f7 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -74,43 +74,57 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
 ; X86-SSE2-LABEL: fshl_i128:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
 ; X86-SSE2-NEXT:    pushl %ebx
 ; X86-SSE2-NEXT:    pushl %edi
 ; X86-SSE2-NEXT:    pushl %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 48(%ebp), %edi
+; X86-SSE2-NEXT:    movl 52(%ebp), %eax
+; X86-SSE2-NEXT:    movl 24(%ebp), %edx
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $64, %cl
-; X86-SSE2-NEXT:    movl %esi, %eax
-; X86-SSE2-NEXT:    cmovnel %ebx, %eax
-; X86-SSE2-NEXT:    movl %edx, %ebp
-; X86-SSE2-NEXT:    cmovnel %edi, %ebp
-; X86-SSE2-NEXT:    cmovnel {{[0-9]+}}(%esp), %edi
-; X86-SSE2-NEXT:    cmovnel {{[0-9]+}}(%esp), %ebx
-; X86-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %edx
-; X86-SSE2-NEXT:    cmovel {{[0-9]+}}(%esp), %esi
+; X86-SSE2-NEXT:    movl %edx, %ecx
+; X86-SSE2-NEXT:    cmovnel %edi, %ecx
+; X86-SSE2-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 28(%ebp), %esi
+; X86-SSE2-NEXT:    movl %esi, %ebx
+; X86-SSE2-NEXT:    cmovnel %eax, %ebx
+; X86-SSE2-NEXT:    cmovnel 44(%ebp), %eax
+; X86-SSE2-NEXT:    cmovnel 40(%ebp), %edi
+; X86-SSE2-NEXT:    cmovel 36(%ebp), %esi
+; X86-SSE2-NEXT:    cmovel 32(%ebp), %edx
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    testb $32, %cl
-; X86-SSE2-NEXT:    cmovnel %esi, %edx
-; X86-SSE2-NEXT:    cmovnel %ebp, %esi
-; X86-SSE2-NEXT:    cmovnel %eax, %ebp
-; X86-SSE2-NEXT:    cmovel %edi, %ebx
+; X86-SSE2-NEXT:    cmovnel %edx, %esi
+; X86-SSE2-NEXT:    cmovnel %ebx, %edx
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    cmovnel %ecx, %ebx
 ; X86-SSE2-NEXT:    cmovel %eax, %edi
-; X86-SSE2-NEXT:    movl %edi, %eax
-; X86-SSE2-NEXT:    shldl %cl, %ebx, %eax
-; X86-SSE2-NEXT:    movl %ebp, %ebx
-; X86-SSE2-NEXT:    shldl %cl, %edi, %ebx
-; X86-SSE2-NEXT:    movl %esi, %edi
-; X86-SSE2-NEXT:    shldl %cl, %ebp, %edi
+; X86-SSE2-NEXT:    cmovel %ecx, %eax
+; X86-SSE2-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
 ; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SSE2-NEXT:    shldl %cl, %esi, %edx
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    movl %edx, 12(%ecx)
-; X86-SSE2-NEXT:    movl %edi, 8(%ecx)
-; X86-SSE2-NEXT:    movl %ebx, 4(%ecx)
-; X86-SSE2-NEXT:    movl %eax, (%ecx)
-; X86-SSE2-NEXT:    movl %ecx, %eax
+; X86-SSE2-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-SSE2-NEXT:    movl %ebx, %edi
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
+; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    shldl %cl, %eax, %edi
+; X86-SSE2-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SSE2-NEXT:    movl %edx, %edi
+; X86-SSE2-NEXT:    movl 56(%ebp), %ecx
+; X86-SSE2-NEXT:    shldl %cl, %ebx, %edi
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
+; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    shldl %cl, %edx, %esi
+; X86-SSE2-NEXT:    movl %esi, 12(%eax)
+; X86-SSE2-NEXT:    movl %edi, 8(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
+; X86-SSE2-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SSE2-NEXT:    movl %ecx, (%eax)
+; X86-SSE2-NEXT:    leal -12(%ebp), %esp
 ; X86-SSE2-NEXT:    popl %esi
 ; X86-SSE2-NEXT:    popl %edi
 ; X86-SSE2-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/i128-add.ll b/llvm/test/CodeGen/X86/i128-add.ll
index 2849e448a053..b4546c1e983c 100644
--- a/llvm/test/CodeGen/X86/i128-add.ll
+++ b/llvm/test/CodeGen/X86/i128-add.ll
@@ -5,17 +5,20 @@
 define i128 @add_i128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: add_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    addl 40(%ebp), %esi
+; X86-NEXT:    adcl 44(%ebp), %edi
+; X86-NEXT:    adcl 48(%ebp), %ecx
+; X86-NEXT:    adcl 52(%ebp), %edx
 ; X86-NEXT:    addl $1, %esi
 ; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %ecx
@@ -24,8 +27,10 @@ define i128 @add_i128(i128 %x, i128 %y) nounwind {
 ; X86-NEXT:    movl %esi, (%eax)
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: add_i128:
diff --git a/llvm/test/CodeGen/X86/i128-fp128-abi.ll b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
index 4152dcf07f7e..2174d5056e6c 100644
--- a/llvm/test/CodeGen/X86/i128-fp128-abi.ll
+++ b/llvm/test/CodeGen/X86/i128-fp128-abi.ll
@@ -55,41 +55,47 @@ define void @store(PrimTy %x, ptr %p) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 16(%esp), %ecx
-; CHECK-X86-NEXT:    movl 20(%esp), %edx
-; CHECK-X86-NEXT:    movl 24(%esp), %esi
-; CHECK-X86-NEXT:    movl 28(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 20(%esp), %ecx
+; CHECK-X86-NEXT:    movl 24(%esp), %edx
+; CHECK-X86-NEXT:    movl 28(%esp), %esi
+; CHECK-X86-NEXT:    movl 32(%esp), %edi
 ; CHECK-X86-NEXT:    movl %esi, 12(%edi)
 ; CHECK-X86-NEXT:    movl %edx, 8(%edi)
 ; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-X86-NEXT:    movl %eax, (%edi)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
 ; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
 ; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
   ret void
 }
 
 ; Illustrate stack alignment
-; FIXME(#77401): alignment on x86-32 is ABI-incorrect.
 define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind {
 ; CHECK-X64-F128-LABEL: store_perturbed:
 ; CHECK-X64-F128:       # %bb.0:
@@ -130,34 +136,41 @@ define void @store_perturbed(i8 %_0, PrimTy %x, ptr %p) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 16(%esp), %eax
-; CHECK-X86-NEXT:    movl 20(%esp), %ecx
-; CHECK-X86-NEXT:    movl 24(%esp), %edx
-; CHECK-X86-NEXT:    movl 28(%esp), %esi
-; CHECK-X86-NEXT:    movl 32(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 32(%esp), %eax
+; CHECK-X86-NEXT:    movl 36(%esp), %ecx
+; CHECK-X86-NEXT:    movl 40(%esp), %edx
+; CHECK-X86-NEXT:    movl 44(%esp), %esi
+; CHECK-X86-NEXT:    movl 48(%esp), %edi
 ; CHECK-X86-NEXT:    movl %esi, 12(%edi)
 ; CHECK-X86-NEXT:    movl %edx, 8(%edi)
 ; CHECK-X86-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-X86-NEXT:    movl %eax, (%edi)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: store_perturbed:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 32(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 28(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 40(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %esi, 12(%edi)
 ; CHECK-MSVC32-NEXT:    movl %edx, 8(%edi)
 ; CHECK-MSVC32-NEXT:    movl %ecx, 4(%edi)
 ; CHECK-MSVC32-NEXT:    movl %eax, (%edi)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   store PrimTy %x, ptr %p
   ret void
@@ -271,34 +284,41 @@ define PrimTy @first_arg(PrimTy %x) nounwind {
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 16(%esp), %ecx
-; CHECK-X86-NEXT:    movl 20(%esp), %edx
-; CHECK-X86-NEXT:    movl 24(%esp), %esi
-; CHECK-X86-NEXT:    movl 28(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 32(%esp), %ecx
+; CHECK-X86-NEXT:    movl 36(%esp), %edx
+; CHECK-X86-NEXT:    movl 40(%esp), %esi
+; CHECK-X86-NEXT:    movl 44(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: first_arg:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 16(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 20(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 24(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 28(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 24(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 28(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 32(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 36(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -344,34 +364,41 @@ define PrimTy @leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, PrimTy %x) nounw
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 48(%esp), %ecx
-; CHECK-X86-NEXT:    movl 52(%esp), %edx
-; CHECK-X86-NEXT:    movl 56(%esp), %esi
-; CHECK-X86-NEXT:    movl 60(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 64(%esp), %ecx
+; CHECK-X86-NEXT:    movl 68(%esp), %edx
+; CHECK-X86-NEXT:    movl 72(%esp), %esi
+; CHECK-X86-NEXT:    movl 76(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: leading_args:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 48(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 52(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 56(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 60(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -417,34 +444,41 @@ define PrimTy @many_leading_args(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, Pr
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 72(%esp), %ecx
-; CHECK-X86-NEXT:    movl 76(%esp), %edx
-; CHECK-X86-NEXT:    movl 80(%esp), %esi
-; CHECK-X86-NEXT:    movl 84(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 80(%esp), %ecx
+; CHECK-X86-NEXT:    movl 84(%esp), %edx
+; CHECK-X86-NEXT:    movl 88(%esp), %esi
+; CHECK-X86-NEXT:    movl 92(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 72(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 76(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 80(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 84(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 72(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 76(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 80(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 84(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -488,34 +522,41 @@ define PrimTy @trailing_arg(i64 %_0, i64 %_1, i64 %_2, i64 %_3, i64 %_4, PrimTy
 ; CHECK-X86:       # %bb.0:
 ; CHECK-X86-NEXT:    pushl %edi
 ; CHECK-X86-NEXT:    pushl %esi
-; CHECK-X86-NEXT:    movl 12(%esp), %eax
-; CHECK-X86-NEXT:    movl 56(%esp), %ecx
-; CHECK-X86-NEXT:    movl 60(%esp), %edx
-; CHECK-X86-NEXT:    movl 64(%esp), %esi
-; CHECK-X86-NEXT:    movl 68(%esp), %edi
+; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    movl 16(%esp), %eax
+; CHECK-X86-NEXT:    movl 64(%esp), %ecx
+; CHECK-X86-NEXT:    movl 68(%esp), %edx
+; CHECK-X86-NEXT:    movl 72(%esp), %esi
+; CHECK-X86-NEXT:    movl 76(%esp), %edi
 ; CHECK-X86-NEXT:    movl %edi, 12(%eax)
 ; CHECK-X86-NEXT:    movl %esi, 8(%eax)
 ; CHECK-X86-NEXT:    movl %edx, 4(%eax)
 ; CHECK-X86-NEXT:    movl %ecx, (%eax)
+; CHECK-X86-NEXT:    addl $4, %esp
 ; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    popl %edi
 ; CHECK-X86-NEXT:    retl $4
 ;
 ; CHECK-MSVC32-LABEL: trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
+; CHECK-MSVC32-NEXT:    pushl %ebp
+; CHECK-MSVC32-NEXT:    movl %esp, %ebp
 ; CHECK-MSVC32-NEXT:    pushl %edi
 ; CHECK-MSVC32-NEXT:    pushl %esi
-; CHECK-MSVC32-NEXT:    movl 12(%esp), %eax
-; CHECK-MSVC32-NEXT:    movl 56(%esp), %ecx
-; CHECK-MSVC32-NEXT:    movl 60(%esp), %edx
-; CHECK-MSVC32-NEXT:    movl 64(%esp), %esi
-; CHECK-MSVC32-NEXT:    movl 68(%esp), %edi
+; CHECK-MSVC32-NEXT:    andl $-16, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 56(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 60(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 64(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl 68(%ebp), %edi
 ; CHECK-MSVC32-NEXT:    movl %edi, 12(%eax)
 ; CHECK-MSVC32-NEXT:    movl %esi, 8(%eax)
 ; CHECK-MSVC32-NEXT:    movl %edx, 4(%eax)
 ; CHECK-MSVC32-NEXT:    movl %ecx, (%eax)
+; CHECK-MSVC32-NEXT:    leal -8(%ebp), %esp
 ; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %edi
+; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   ret PrimTy %x
 }
@@ -571,32 +612,43 @@ define void @call_first_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_first_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $56, %esp
+; CHECK-X86-NEXT:    movl 64(%esp), %eax
+; CHECK-X86-NEXT:    movl 68(%esp), %ecx
+; CHECK-X86-NEXT:    movl 72(%esp), %edx
+; CHECK-X86-NEXT:    movl 76(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 28(%esp)
+; CHECK-X86-NEXT:    movl %edx, 24(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 20(%esp)
+; CHECK-X86-NEXT:    movl %eax, 16(%esp)
+; CHECK-X86-NEXT:    leal 32(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
 ; CHECK-X86-NEXT:    calll first_arg@PLT
-; CHECK-X86-NEXT:    addl $56, %esp
+; CHECK-X86-NEXT:    addl $52, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_first_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $64, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 16(%esp)
+; CHECK-MSVC32-NEXT:    leal 32(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
 ; CHECK-MSVC32-NEXT:    calll _first_arg
-; CHECK-MSVC32-NEXT:    addl $20, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @first_arg(PrimTy %x)
@@ -686,48 +738,59 @@ define void @call_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    movl 96(%esp), %eax
+; CHECK-X86-NEXT:    movl 100(%esp), %ecx
+; CHECK-X86-NEXT:    movl 104(%esp), %edx
+; CHECK-X86-NEXT:    movl 108(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 60(%esp)
+; CHECK-X86-NEXT:    movl %edx, 56(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
+; CHECK-X86-NEXT:    movl %eax, 48(%esp)
+; CHECK-X86-NEXT:    leal 64(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll leading_args@PLT
-; CHECK-X86-NEXT:    addl $88, %esp
+; CHECK-X86-NEXT:    addl $84, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
+; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _leading_args
-; CHECK-MSVC32-NEXT:    addl $52, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
@@ -836,56 +899,67 @@ define void @call_many_leading_args(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_many_leading_args:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $104, %esp
+; CHECK-X86-NEXT:    movl 112(%esp), %eax
+; CHECK-X86-NEXT:    movl 116(%esp), %ecx
+; CHECK-X86-NEXT:    movl 120(%esp), %edx
+; CHECK-X86-NEXT:    movl 124(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 76(%esp)
+; CHECK-X86-NEXT:    movl %edx, 72(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 68(%esp)
+; CHECK-X86-NEXT:    movl %eax, 64(%esp)
+; CHECK-X86-NEXT:    leal 80(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 60(%esp)
+; CHECK-X86-NEXT:    movl $0, 56(%esp)
+; CHECK-X86-NEXT:    movl $0, 52(%esp)
+; CHECK-X86-NEXT:    movl $0, 48(%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll many_leading_args@PLT
-; CHECK-X86-NEXT:    addl $104, %esp
+; CHECK-X86-NEXT:    addl $100, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_many_leading_args:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $112, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 76(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 72(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 68(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 64(%esp)
+; CHECK-MSVC32-NEXT:    leal 80(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 48(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _many_leading_args
-; CHECK-MSVC32-NEXT:    addl $68, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @many_leading_args(i64 0, i64 0, i64 0, i64 0, PrimTy Prim0, PrimTy %x)
@@ -975,48 +1049,59 @@ define void @call_trailing_arg(PrimTy %x) nounwind {
 ;
 ; CHECK-X86-LABEL: call_trailing_arg:
 ; CHECK-X86:       # %bb.0:
-; CHECK-X86-NEXT:    subl $40, %esp
-; CHECK-X86-NEXT:    leal 12(%esp), %eax
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl 56(%esp)
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl $0
-; CHECK-X86-NEXT:    pushl %eax
+; CHECK-X86-NEXT:    pushl %esi
+; CHECK-X86-NEXT:    subl $88, %esp
+; CHECK-X86-NEXT:    movl 96(%esp), %eax
+; CHECK-X86-NEXT:    movl 100(%esp), %ecx
+; CHECK-X86-NEXT:    movl 104(%esp), %edx
+; CHECK-X86-NEXT:    movl 108(%esp), %esi
+; CHECK-X86-NEXT:    movl %esi, 60(%esp)
+; CHECK-X86-NEXT:    movl %edx, 56(%esp)
+; CHECK-X86-NEXT:    movl %ecx, 52(%esp)
+; CHECK-X86-NEXT:    movl %eax, 48(%esp)
+; CHECK-X86-NEXT:    leal 64(%esp), %eax
+; CHECK-X86-NEXT:    movl %eax, (%esp)
+; CHECK-X86-NEXT:    movl $0, 32(%esp)
+; CHECK-X86-NEXT:    movl $0, 28(%esp)
+; CHECK-X86-NEXT:    movl $0, 24(%esp)
+; CHECK-X86-NEXT:    movl $0, 20(%esp)
+; CHECK-X86-NEXT:    movl $0, 16(%esp)
+; CHECK-X86-NEXT:    movl $0, 12(%esp)
+; CHECK-X86-NEXT:    movl $0, 8(%esp)
+; CHECK-X86-NEXT:    movl $0, 4(%esp)
 ; CHECK-X86-NEXT:    calll trailing_arg@PLT
-; CHECK-X86-NEXT:    addl $88, %esp
+; CHECK-X86-NEXT:    addl $84, %esp
+; CHECK-X86-NEXT:    popl %esi
 ; CHECK-X86-NEXT:    retl
 ;
 ; CHECK-MSVC32-LABEL: call_trailing_arg:
 ; CHECK-MSVC32:       # %bb.0:
 ; CHECK-MSVC32-NEXT:    pushl %ebp
 ; CHECK-MSVC32-NEXT:    movl %esp, %ebp
+; CHECK-MSVC32-NEXT:    pushl %esi
 ; CHECK-MSVC32-NEXT:    andl $-16, %esp
-; CHECK-MSVC32-NEXT:    subl $32, %esp
-; CHECK-MSVC32-NEXT:    movl %esp, %eax
-; CHECK-MSVC32-NEXT:    pushl 20(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 16(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 12(%ebp)
-; CHECK-MSVC32-NEXT:    pushl 8(%ebp)
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl $0
-; CHECK-MSVC32-NEXT:    pushl %eax
+; CHECK-MSVC32-NEXT:    subl $96, %esp
+; CHECK-MSVC32-NEXT:    movl 8(%ebp), %eax
+; CHECK-MSVC32-NEXT:    movl 12(%ebp), %ecx
+; CHECK-MSVC32-NEXT:    movl 16(%ebp), %edx
+; CHECK-MSVC32-NEXT:    movl 20(%ebp), %esi
+; CHECK-MSVC32-NEXT:    movl %esi, 60(%esp)
+; CHECK-MSVC32-NEXT:    movl %edx, 56(%esp)
+; CHECK-MSVC32-NEXT:    movl %ecx, 52(%esp)
+; CHECK-MSVC32-NEXT:    movl %eax, 48(%esp)
+; CHECK-MSVC32-NEXT:    leal 64(%esp), %eax
+; CHECK-MSVC32-NEXT:    movl %eax, (%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 32(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 28(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 24(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 20(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 16(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 12(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 8(%esp)
+; CHECK-MSVC32-NEXT:    movl $0, 4(%esp)
 ; CHECK-MSVC32-NEXT:    calll _trailing_arg
-; CHECK-MSVC32-NEXT:    addl $52, %esp
-; CHECK-MSVC32-NEXT:    movl %ebp, %esp
+; CHECK-MSVC32-NEXT:    leal -4(%ebp), %esp
+; CHECK-MSVC32-NEXT:    popl %esi
 ; CHECK-MSVC32-NEXT:    popl %ebp
 ; CHECK-MSVC32-NEXT:    retl
   call PrimTy @trailing_arg(i64 0, i64 0, i64 0, i64 0, PrimTy %x)
diff --git a/llvm/test/CodeGen/X86/i128-sdiv.ll b/llvm/test/CodeGen/X86/i128-sdiv.ll
index 717f52f198ee..7d5757392c98 100644
--- a/llvm/test/CodeGen/X86/i128-sdiv.ll
+++ b/llvm/test/CodeGen/X86/i128-sdiv.ll
@@ -8,18 +8,21 @@
 define i128 @test1(i128 %x) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %esi
 ; X86-NEXT:    sarl $31, %esi
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl $30, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    adcl 28(%ebp), %esi
+; X86-NEXT:    adcl 32(%ebp), %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    shrdl $2, %ecx, %edx
 ; X86-NEXT:    movl %ecx, %esi
@@ -29,8 +32,10 @@ define i128 @test1(i128 %x) nounwind {
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl %esi, 4(%eax)
 ; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test1:
@@ -52,38 +57,44 @@ define i128 @test1(i128 %x) nounwind {
 define i128 @test2(i128 %x) nounwind {
 ; X86-LABEL: test2:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, %ecx
 ; X86-NEXT:    shrl $30, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    addl %esi, %edi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    adcl $0, %edx
-; X86-NEXT:    shrdl $2, %edx, %ecx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    addl %edx, %esi
+; X86-NEXT:    adcl 28(%ebp), %edx
+; X86-NEXT:    adcl 32(%ebp), %ecx
+; X86-NEXT:    adcl $0, %eax
+; X86-NEXT:    shrdl $2, %eax, %ecx
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    sarl $2, %edx
-; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    sarl $2, %eax
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl $0, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %edx
+; X86-NEXT:    sbbl %esi, %ebx
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %esi, %edi
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %edx, 8(%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test2:
diff --git a/llvm/test/CodeGen/X86/i128-udiv.ll b/llvm/test/CodeGen/X86/i128-udiv.ll
index 3f890b7f2443..901183242132 100644
--- a/llvm/test/CodeGen/X86/i128-udiv.ll
+++ b/llvm/test/CodeGen/X86/i128-udiv.ll
@@ -8,15 +8,21 @@
 define i128 @test1(i128 %x) nounwind {
 ; X86-LABEL: test1:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %edx
 ; X86-NEXT:    shrdl $2, %edx, %ecx
 ; X86-NEXT:    shrl $2, %edx
 ; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    movl $0, 12(%eax)
 ; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test1:
diff --git a/llvm/test/CodeGen/X86/iabs.ll b/llvm/test/CodeGen/X86/iabs.ll
index 55c318e87a5a..bdceeefbcfab 100644
--- a/llvm/test/CodeGen/X86/iabs.ll
+++ b/llvm/test/CodeGen/X86/iabs.ll
@@ -123,31 +123,34 @@ define i64 @test_i64(i64 %a) nounwind {
 define i128 @test_i128(i128 %a) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    xorl %edx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %edx, %ebx
-; X86-NEXT:    subl %edx, %ebx
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    sbbl %edx, %esi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebx, (%eax)
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    xorl %eax, %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    xorl %eax, %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    xorl %eax, %esi
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    xorl %eax, %edi
+; X86-NEXT:    subl %eax, %edi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: test_i128:
diff --git a/llvm/test/CodeGen/X86/icmp-shift-opt.ll b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
index c52b3ed6c926..4a6c1d0ae5de 100644
--- a/llvm/test/CodeGen/X86/icmp-shift-opt.ll
+++ b/llvm/test/CodeGen/X86/icmp-shift-opt.ll
@@ -10,33 +10,39 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_lt_power_of_2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %edi
+; X86-NEXT:    movl 24(%ebp), %esi
 ; X86-NEXT:    .p2align 4
 ; X86-NEXT:  .LBB0_1: # %loop
 ; X86-NEXT:    # =>This Inner Loop Header: Depth=1
-; X86-NEXT:    addl $1, %edi
-; X86-NEXT:    adcl $0, %esi
+; X86-NEXT:    addl $1, %esi
+; X86-NEXT:    adcl $0, %edi
 ; X86-NEXT:    adcl $0, %edx
 ; X86-NEXT:    adcl $0, %ecx
 ; X86-NEXT:    movl %edx, %ebx
 ; X86-NEXT:    orl %ecx, %ebx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    orl %edx, %ebp
-; X86-NEXT:    orl %ecx, %ebp
-; X86-NEXT:    shrdl $28, %ebx, %ebp
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shrdl $28, %ebx, %esi
+; X86-NEXT:    movl %eax, %esi
 ; X86-NEXT:    jne .LBB0_1
 ; X86-NEXT:  # %bb.2: # %exit
-; X86-NEXT:    movl %edi, (%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %edx, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -73,15 +79,21 @@ exit:
 define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    orl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    shldl $15, %edx, %ecx
 ; X86-NEXT:    sete %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_eq_zero:
@@ -98,15 +110,21 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_srl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    orl 20(%ebp), %ecx
+; X86-NEXT:    movl 8(%ebp), %edx
 ; X86-NEXT:    orl %eax, %edx
 ; X86-NEXT:    orl %ecx, %edx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    shldl $15, %edx, %ecx
 ; X86-NEXT:    setne %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_srl_ne_zero:
@@ -123,13 +141,19 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
 ; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl 8(%ebp), %eax
+; X86-NEXT:    orl 12(%ebp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    sete %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero:
@@ -146,13 +170,19 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_ne_zero:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
 ; X86-NEXT:    shll $17, %ecx
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl 8(%ebp), %eax
+; X86-NEXT:    orl 12(%ebp), %eax
 ; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    setne %al
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_ne_zero:
@@ -170,13 +200,17 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
 define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
 ; X86-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
 ; X86-NEXT:    shldl $17, %edx, %esi
 ; X86-NEXT:    shldl $17, %ecx, %edx
 ; X86-NEXT:    shldl $17, %eax, %ecx
@@ -194,9 +228,11 @@ define i1 @opt_setcc_shl_eq_zero_multiple_shl_users(i128 %a) nounwind {
 ; X86-NEXT:    calll use@PLT
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: opt_setcc_shl_eq_zero_multiple_shl_users:
diff --git a/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
new file mode 100644
index 000000000000..e272e7ee3cb0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/late-tail-dup-computed-goto.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=x86_64-unknown-linux-gnu -run-pass=tailduplication -tail-dup-pred-size=1 -tail-dup-succ-size=1 %s -o - | FileCheck %s
+#
+# Check that only the computed gotos are duplicated aggressively.
+#
+--- |
+  @computed_goto.dispatch = constant [5 x ptr] [ptr null, ptr blockaddress(@computed_goto, %bb1), ptr blockaddress(@computed_goto, %bb2), ptr blockaddress(@computed_goto, %bb3), ptr blockaddress(@computed_goto, %bb4)]
+  declare i64 @f0()
+  declare i64 @f1()
+  declare i64 @f2()
+  declare i64 @f3()
+  declare i64 @f4()
+  declare i64 @f5()
+  define void @computed_goto() {
+    start:
+      ret void
+    bb1:
+      ret void
+    bb2:
+      ret void
+    bb3:
+      ret void
+    bb4:
+      ret void
+  }
+  define void @jump_table() { ret void }
+  define void @jump_table_pic() { ret void }
+...
+---
+name:            computed_goto
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: computed_goto
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY3]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY4]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY5]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+  ; CHECK-NEXT:   successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gr64 = COPY $rax
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gr64_nosp = COPY [[COPY6]]
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gr64_nosp = COPY [[COPY1]]
+  ; CHECK-NEXT:   JMP64m $noreg, 8, [[COPY2]], @computed_goto.dispatch, $noreg
+  bb.0:
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f0, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %0:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %0
+    JMP_1 %bb.5
+
+  bb.1.bb1 (ir-block-address-taken %ir-block.bb1):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f1, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %1:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %1
+    JMP_1 %bb.5
+
+  bb.2.bb2 (ir-block-address-taken %ir-block.bb2):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f2, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %2:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %2
+    JMP_1 %bb.5
+
+  bb.3.bb3 (ir-block-address-taken %ir-block.bb3):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f3, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %3:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %3
+    JMP_1 %bb.5
+
+  bb.4.bb4 (ir-block-address-taken %ir-block.bb4):
+    successors: %bb.5(0x80000000)
+
+    CALL64pcrel32 target-flags(x86-plt) @f4, csr_64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp, implicit-def $rax
+    %4:gr64 = COPY $rax
+    %6:gr64_nosp = COPY %4
+
+  bb.5:
+    successors: %bb.1(0x20000000), %bb.2(0x20000000), %bb.3(0x20000000), %bb.4(0x20000000)
+
+    %5:gr64_nosp = COPY %6
+    JMP64m $noreg, 8, %5, @computed_goto.dispatch, $noreg
+...
diff --git a/llvm/test/CodeGen/X86/mul128.ll b/llvm/test/CodeGen/X86/mul128.ll
index fc1cc1f65627..e10e48f9aea0 100644
--- a/llvm/test/CodeGen/X86/mul128.ll
+++ b/llvm/test/CodeGen/X86/mul128.ll
@@ -18,85 +18,80 @@ define i128 @foo(i128 %t, i128 %u) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
 ; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
 ; X86-NEXT:    pushl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    pushl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 28
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
-; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    imull %ecx, %ebp
-; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl 40(%ebp), %edi
+; X86-NEXT:    movl 44(%ebp), %esi
+; X86-NEXT:    imull %ecx, %esi
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    mull %ecx
-; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    addl %ebp, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull %esi, %eax
+; X86-NEXT:    addl %esi, %ebx
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    imull %edi, %eax
 ; X86-NEXT:    addl %eax, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    imull %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    mull %ebp
-; X86-NEXT:    addl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    imull %ebp, %esi
+; X86-NEXT:    movl 48(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    imull 28(%ebp), %ecx
+; X86-NEXT:    movl 24(%ebp), %edi
+; X86-NEXT:    mull %edi
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    movl 52(%ebp), %esi
+; X86-NEXT:    imull %edi, %esi
 ; X86-NEXT:    addl %edx, %esi
-; X86-NEXT:    addl %edi, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
+; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
 ; X86-NEXT:    adcl %ebx, %esi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    mull %edi
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull %ecx
 ; X86-NEXT:    movl %edx, %edi
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addl %ebx, %ecx
 ; X86-NEXT:    adcl $0, %edi
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    addl %ecx, %ebx
-; X86-NEXT:    adcl %edi, %ebp
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    adcl %edi, %ebx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    addl %ebp, %eax
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    mull 44(%ebp)
+; X86-NEXT:    addl %ebx, %eax
 ; X86-NEXT:    movzbl %cl, %ecx
 ; X86-NEXT:    adcl %ecx, %edx
-; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT:    addl (%esp), %eax # 4-byte Folded Reload
 ; X86-NEXT:    adcl %esi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ebx, 4(%ecx)
-; X86-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT:    movl %esi, 4(%ecx)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
 ; X86-NEXT:    movl %esi, (%ecx)
 ; X86-NEXT:    movl %eax, 8(%ecx)
 ; X86-NEXT:    movl %edx, 12(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $8, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    .cfi_def_cfa_offset 12
 ; X86-NEXT:    popl %ebx
-; X86-NEXT:    .cfi_def_cfa_offset 8
 ; X86-NEXT:    popl %ebp
-; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl $4
   %k = mul i128 %t, %u
   ret i128 %k
diff --git a/llvm/test/CodeGen/X86/neg-abs.ll b/llvm/test/CodeGen/X86/neg-abs.ll
index 961205c50d97..724b2dc4c431 100644
--- a/llvm/test/CodeGen/X86/neg-abs.ll
+++ b/llvm/test/CodeGen/X86/neg-abs.ll
@@ -105,31 +105,35 @@ define i128 @neg_abs_i128(i128 %x) nounwind {
 ; X86-LABEL: neg_abs_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarl $31, %ecx
+; X86-NEXT:    xorl %ecx, %eax
+; X86-NEXT:    movl 32(%ebp), %edx
 ; X86-NEXT:    xorl %ecx, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %ecx, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    xorl %ecx, %ebx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    subl %ebx, %ebp
 ; X86-NEXT:    movl %ecx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    subl %edi, %ebx
 ; X86-NEXT:    movl %ecx, %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %ecx
-; X86-NEXT:    movl %ebp, (%eax)
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -259,37 +263,42 @@ define i64 @sub_abs_i64(i64 %x, i64 %y) nounwind {
 define i128 @sub_abs_i128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: sub_abs_i128:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 36(%ebp), %eax
 ; X86-NEXT:    movl %eax, %edx
 ; X86-NEXT:    sarl $31, %edx
 ; X86-NEXT:    xorl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 32(%ebp), %ecx
 ; X86-NEXT:    xorl %edx, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 28(%ebp), %esi
 ; X86-NEXT:    xorl %edx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 24(%ebp), %edi
 ; X86-NEXT:    xorl %edx, %edi
 ; X86-NEXT:    subl %edx, %edi
 ; X86-NEXT:    sbbl %edx, %esi
 ; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl 40(%ebp), %edx
 ; X86-NEXT:    subl %edi, %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl 44(%ebp), %edi
 ; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl 48(%ebp), %esi
 ; X86-NEXT:    sbbl %ecx, %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    sbbl %eax, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %edx, (%eax)
 ; X86-NEXT:    movl %edi, 4(%eax)
 ; X86-NEXT:    movl %esi, 8(%eax)
 ; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: sub_abs_i128:
diff --git a/llvm/test/CodeGen/X86/popcnt.ll b/llvm/test/CodeGen/X86/popcnt.ll
index 35c7c0e09f39..3004b8b72fcc 100644
--- a/llvm/test/CodeGen/X86/popcnt.ll
+++ b/llvm/test/CodeGen/X86/popcnt.ll
@@ -340,84 +340,87 @@ define i64 @cnt64(i64 %x) nounwind readnone {
 define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-NOSSE-LABEL: cnt128:
 ; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %ebx
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %esi
-; X86-NOSSE-NEXT:    addl %edi, %esi
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %edx, %edi
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %ecx
+; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $4, %edi
 ; X86-NOSSE-NEXT:    addl %ecx, %edi
+; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
 ; X86-NOSSE-NEXT:    movl $0, 12(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 8(%eax)
 ; X86-NOSSE-NEXT:    movl $0, 4(%eax)
+; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
-; X86-NOSSE-NEXT:    popl %ebx
+; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
 ; X64-BASE-LABEL: cnt128:
@@ -462,20 +465,26 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-POPCNT-LABEL: cnt128:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
 ; X86-POPCNT-NEXT:    movl $0, 12(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 8(%eax)
 ; X86-POPCNT-NEXT:    movl $0, 4(%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128:
@@ -522,7 +531,11 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ;
 ; X86-SSE2-LABEL: cnt128:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -564,11 +577,17 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSE2-NEXT:    movl $0, 12(%eax)
 ; X86-SSE2-NEXT:    movl $0, 8(%eax)
 ; X86-SSE2-NEXT:    movl $0, 4(%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -600,6 +619,8 @@ define i128 @cnt128(i128 %x) nounwind readnone {
 ; X86-SSSE3-NEXT:    movl $0, 12(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 8(%eax)
 ; X86-SSSE3-NEXT:    movl $0, 4(%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
@@ -928,87 +949,92 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-NOSSE-LABEL: cnt128_optsize:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %ebx
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ecx
-; X86-NOSSE-NEXT:    shrl %ecx
-; X86-NOSSE-NEXT:    movl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %edi, %ecx
-; X86-NOSSE-NEXT:    subl %ecx, %ebx
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    subl $16, %esp
+; X86-NOSSE-NEXT:    movl 32(%ebp), %edx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %eax
+; X86-NOSSE-NEXT:    shrl %eax
+; X86-NOSSE-NEXT:    movl $1431655765, %ecx # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %ecx, %eax
+; X86-NOSSE-NEXT:    subl %eax, %esi
 ; X86-NOSSE-NEXT:    movl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ecx, %ebp
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    addl %edi, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %esi, %edi
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    movl $1431655765, %eax # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %eax, %esi
+; X86-NOSSE-NEXT:    subl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %esi
+; X86-NOSSE-NEXT:    andl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl $2, %edx
+; X86-NOSSE-NEXT:    andl %ecx, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %edx, %ebx
+; X86-NOSSE-NEXT:    shrl $4, %ebx
+; X86-NOSSE-NEXT:    addl %edx, %ebx
+; X86-NOSSE-NEXT:    movl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %edx, %edi
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edi
+; X86-NOSSE-NEXT:    andl %edx, %ebx
+; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %edi, %edx
+; X86-NOSSE-NEXT:    movl 28(%ebp), %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl %edi
+; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    subl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    andl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $2, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
-; X86-NOSSE-NEXT:    addl %ebp, %ebx
-; X86-NOSSE-NEXT:    movl %ebx, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %ebx, %ebp
+; X86-NOSSE-NEXT:    addl %edi, %ebx
+; X86-NOSSE-NEXT:    movl %ebx, %edi
+; X86-NOSSE-NEXT:    shrl $4, %edi
+; X86-NOSSE-NEXT:    addl %ebx, %edi
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl %edi, %ebx
+; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    andl %esi, %ebx
 ; X86-NOSSE-NEXT:    subl %ebx, %eax
 ; X86-NOSSE-NEXT:    movl %eax, %ebx
 ; X86-NOSSE-NEXT:    andl %ecx, %ebx
 ; X86-NOSSE-NEXT:    shrl $2, %eax
 ; X86-NOSSE-NEXT:    andl %ecx, %eax
 ; X86-NOSSE-NEXT:    addl %ebx, %eax
-; X86-NOSSE-NEXT:    movl %eax, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    movl %eax, %ecx
+; X86-NOSSE-NEXT:    shrl $4, %ecx
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    movl $252645135, %eax # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    andl %eax, %edi
+; X86-NOSSE-NEXT:    andl %eax, %ecx
+; X86-NOSSE-NEXT:    imull $16843009, %edi, %eax # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %edi
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    addl %eax, %edi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %ebp # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %ebp, %eax
-; X86-NOSSE-NEXT:    subl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    shrl $2, %esi
-; X86-NOSSE-NEXT:    andl %ecx, %esi
-; X86-NOSSE-NEXT:    addl %eax, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebp
-; X86-NOSSE-NEXT:    shrl $4, %ebp
-; X86-NOSSE-NEXT:    addl %esi, %ebp
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl %eax
-; X86-NOSSE-NEXT:    movl $1431655765, %esi # imm = 0x55555555
-; X86-NOSSE-NEXT:    andl %esi, %eax
-; X86-NOSSE-NEXT:    subl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ecx, %eax
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl %ecx, %edx
-; X86-NOSSE-NEXT:    addl %eax, %edx
-; X86-NOSSE-NEXT:    movl %edx, %eax
-; X86-NOSSE-NEXT:    shrl $4, %eax
-; X86-NOSSE-NEXT:    addl %edx, %eax
-; X86-NOSSE-NEXT:    andl %ebx, %ebp
-; X86-NOSSE-NEXT:    andl %ebx, %eax
-; X86-NOSSE-NEXT:    imull $16843009, %ebp, %ecx # imm = 0x1010101
+; X86-NOSSE-NEXT:    imull $16843009, %ecx, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
-; X86-NOSSE-NEXT:    imull $16843009, %eax, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    addl %ecx, %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    xorl %ecx, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    addl %eax, %ecx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %edx, %ecx
+; X86-NOSSE-NEXT:    xorl %edx, %edx
+; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    leal -12(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
 ; X86-NOSSE-NEXT:    popl %ebx
@@ -1057,13 +1083,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-POPCNT-LABEL: cnt128_optsize:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
@@ -1071,7 +1101,9 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_optsize:
@@ -1118,7 +1150,11 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ;
 ; X86-SSE2-LABEL: cnt128_optsize:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1161,11 +1197,17 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128_optsize:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1198,6 +1240,8 @@ define i128 @cnt128_optsize(i128 %x) nounwind readnone optsize {
 ; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
@@ -1415,85 +1459,88 @@ define i64 @cnt64_pgso(i64 %x) nounwind readnone !prof !14 {
 define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-NOSSE-LABEL: cnt128_pgso:
 ; X86-NOSSE:       # %bb.0:
-; X86-NOSSE-NEXT:    pushl %ebx
+; X86-NOSSE-NEXT:    pushl %ebp
+; X86-NOSSE-NEXT:    movl %esp, %ebp
 ; X86-NOSSE-NEXT:    pushl %edi
 ; X86-NOSSE-NEXT:    pushl %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %edi
-; X86-NOSSE-NEXT:    movl %edi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %edi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %edi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl %ebx
-; X86-NOSSE-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $858993459, %ebx # imm = 0x33333333
+; X86-NOSSE-NEXT:    andl $-16, %esp
+; X86-NOSSE-NEXT:    movl 24(%ebp), %eax
+; X86-NOSSE-NEXT:    movl 32(%ebp), %ecx
+; X86-NOSSE-NEXT:    movl 36(%ebp), %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %esi
 ; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %ebx, %esi
-; X86-NOSSE-NEXT:    movl %esi, %ebx
-; X86-NOSSE-NEXT:    shrl $4, %ebx
-; X86-NOSSE-NEXT:    addl %esi, %ebx
-; X86-NOSSE-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %ebx, %esi # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %esi
-; X86-NOSSE-NEXT:    addl %edi, %esi
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
-; X86-NOSSE-NEXT:    shrl $2, %edx
-; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %edx
-; X86-NOSSE-NEXT:    movl %edx, %edi
-; X86-NOSSE-NEXT:    shrl $4, %edi
-; X86-NOSSE-NEXT:    addl %edx, %edi
-; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
-; X86-NOSSE-NEXT:    imull $16843009, %edi, %edx # imm = 0x1010101
-; X86-NOSSE-NEXT:    shrl $24, %edx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    shrl %edi
-; X86-NOSSE-NEXT:    andl $1431655765, %edi # imm = 0x55555555
-; X86-NOSSE-NEXT:    subl %edi, %ecx
-; X86-NOSSE-NEXT:    movl %ecx, %edi
-; X86-NOSSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
 ; X86-NOSSE-NEXT:    shrl $2, %ecx
 ; X86-NOSSE-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; X86-NOSSE-NEXT:    addl %edi, %ecx
+; X86-NOSSE-NEXT:    addl %esi, %ecx
 ; X86-NOSSE-NEXT:    movl %ecx, %edi
 ; X86-NOSSE-NEXT:    shrl $4, %edi
 ; X86-NOSSE-NEXT:    addl %ecx, %edi
+; X86-NOSSE-NEXT:    movl 28(%ebp), %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
 ; X86-NOSSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
 ; X86-NOSSE-NEXT:    imull $16843009, %edi, %ecx # imm = 0x1010101
 ; X86-NOSSE-NEXT:    shrl $24, %ecx
 ; X86-NOSSE-NEXT:    addl %edx, %ecx
-; X86-NOSSE-NEXT:    addl %esi, %ecx
-; X86-NOSSE-NEXT:    xorl %edx, %edx
-; X86-NOSSE-NEXT:    movl %edx, 12(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 8(%eax)
-; X86-NOSSE-NEXT:    movl %edx, 4(%eax)
-; X86-NOSSE-NEXT:    movl %ecx, (%eax)
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl %edx
+; X86-NOSSE-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %edx, %esi
+; X86-NOSSE-NEXT:    movl %esi, %edx
+; X86-NOSSE-NEXT:    shrl $4, %edx
+; X86-NOSSE-NEXT:    addl %esi, %edx
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl %esi
+; X86-NOSSE-NEXT:    andl $1431655765, %esi # imm = 0x55555555
+; X86-NOSSE-NEXT:    subl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    andl $858993459, %esi # imm = 0x33333333
+; X86-NOSSE-NEXT:    shrl $2, %eax
+; X86-NOSSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X86-NOSSE-NEXT:    addl %esi, %eax
+; X86-NOSSE-NEXT:    movl %eax, %esi
+; X86-NOSSE-NEXT:    shrl $4, %esi
+; X86-NOSSE-NEXT:    addl %eax, %esi
+; X86-NOSSE-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %edx, %eax # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %eax
+; X86-NOSSE-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
+; X86-NOSSE-NEXT:    imull $16843009, %esi, %edx # imm = 0x1010101
+; X86-NOSSE-NEXT:    shrl $24, %edx
+; X86-NOSSE-NEXT:    addl %eax, %edx
+; X86-NOSSE-NEXT:    movl 8(%ebp), %eax
+; X86-NOSSE-NEXT:    addl %ecx, %edx
+; X86-NOSSE-NEXT:    xorl %ecx, %ecx
+; X86-NOSSE-NEXT:    movl %ecx, 12(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 8(%eax)
+; X86-NOSSE-NEXT:    movl %ecx, 4(%eax)
+; X86-NOSSE-NEXT:    movl %edx, (%eax)
+; X86-NOSSE-NEXT:    leal -8(%ebp), %esp
 ; X86-NOSSE-NEXT:    popl %esi
 ; X86-NOSSE-NEXT:    popl %edi
-; X86-NOSSE-NEXT:    popl %ebx
+; X86-NOSSE-NEXT:    popl %ebp
 ; X86-NOSSE-NEXT:    retl $4
 ;
 ; X64-BASE-LABEL: cnt128_pgso:
@@ -1538,13 +1585,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-POPCNT-LABEL: cnt128_pgso:
 ; X86-POPCNT:       # %bb.0:
+; X86-POPCNT-NEXT:    pushl %ebp
+; X86-POPCNT-NEXT:    movl %esp, %ebp
 ; X86-POPCNT-NEXT:    pushl %esi
-; X86-POPCNT-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %edx
+; X86-POPCNT-NEXT:    andl $-16, %esp
+; X86-POPCNT-NEXT:    subl $16, %esp
+; X86-POPCNT-NEXT:    movl 8(%ebp), %eax
+; X86-POPCNT-NEXT:    popcntl 36(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 32(%ebp), %edx
 ; X86-POPCNT-NEXT:    addl %ecx, %edx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %ecx
-; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %esi
+; X86-POPCNT-NEXT:    popcntl 28(%ebp), %ecx
+; X86-POPCNT-NEXT:    popcntl 24(%ebp), %esi
 ; X86-POPCNT-NEXT:    addl %ecx, %esi
 ; X86-POPCNT-NEXT:    addl %edx, %esi
 ; X86-POPCNT-NEXT:    xorl %ecx, %ecx
@@ -1552,7 +1603,9 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-POPCNT-NEXT:    movl %ecx, 8(%eax)
 ; X86-POPCNT-NEXT:    movl %ecx, 4(%eax)
 ; X86-POPCNT-NEXT:    movl %esi, (%eax)
+; X86-POPCNT-NEXT:    leal -4(%ebp), %esp
 ; X86-POPCNT-NEXT:    popl %esi
+; X86-POPCNT-NEXT:    popl %ebp
 ; X86-POPCNT-NEXT:    retl $4
 ;
 ; X64-POPCNT-LABEL: cnt128_pgso:
@@ -1599,7 +1652,11 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ;
 ; X86-SSE2-LABEL: cnt128_pgso:
 ; X86-SSE2:       # %bb.0:
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSE2-NEXT:    pushl %ebp
+; X86-SSE2-NEXT:    movl %esp, %ebp
+; X86-SSE2-NEXT:    andl $-16, %esp
+; X86-SSE2-NEXT:    subl $16, %esp
+; X86-SSE2-NEXT:    movl 8(%ebp), %eax
 ; X86-SSE2-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
 ; X86-SSE2-NEXT:    psrlw $1, %xmm0
@@ -1642,11 +1699,17 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSE2-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSE2-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movl %edx, (%eax)
+; X86-SSE2-NEXT:    movl %ebp, %esp
+; X86-SSE2-NEXT:    popl %ebp
 ; X86-SSE2-NEXT:    retl $4
 ;
 ; X86-SSSE3-LABEL: cnt128_pgso:
 ; X86-SSSE3:       # %bb.0:
-; X86-SSSE3-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SSSE3-NEXT:    pushl %ebp
+; X86-SSSE3-NEXT:    movl %esp, %ebp
+; X86-SSSE3-NEXT:    andl $-16, %esp
+; X86-SSSE3-NEXT:    subl $16, %esp
+; X86-SSSE3-NEXT:    movl 8(%ebp), %eax
 ; X86-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
 ; X86-SSSE3-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
 ; X86-SSSE3-NEXT:    movdqa %xmm2, %xmm3
@@ -1679,6 +1742,8 @@ define i128 @cnt128_pgso(i128 %x) nounwind readnone !prof !14 {
 ; X86-SSSE3-NEXT:    movl %ecx, 8(%eax)
 ; X86-SSSE3-NEXT:    movl %ecx, 4(%eax)
 ; X86-SSSE3-NEXT:    movl %edx, (%eax)
+; X86-SSSE3-NEXT:    movl %ebp, %esp
+; X86-SSSE3-NEXT:    popl %ebp
 ; X86-SSSE3-NEXT:    retl $4
   %cnt = tail call i128 @llvm.ctpop.i128(i128 %x)
   ret i128 %cnt
diff --git a/llvm/test/CodeGen/X86/pr154492.ll b/llvm/test/CodeGen/X86/pr154492.ll
new file mode 100644
index 000000000000..1ba17594976e
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr154492.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f  | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+define <16 x i32> @PR154492() {
+; AVX512F-LABEL: PR154492:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512F-NEXT:    vmovaps %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: PR154492:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vcvttps2udq %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> zeroinitializer, <16 x i32> zeroinitializer, i16 255, i32 4)
+  ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/pr156256.ll b/llvm/test/CodeGen/X86/pr156256.ll
new file mode 100644
index 000000000000..13caa6fee587
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr156256.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=AVX512VL
+
+define <16 x i16> @PR156256(<16 x i32> %a, <16 x i32> %b) {
+; AVX512-LABEL: PR156256:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    retq
+;
+; AVX512VL-LABEL: PR156256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512VL-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; AVX512VL-NEXT:    retq
+  %icmp = icmp ugt <16 x i32> %a, %b
+  %sext = sext <16 x i1> %icmp to <16 x i16>
+  %and = and <16 x i16> %sext, splat (i16 16256)
+  ret <16 x i16> %and
+}
diff --git a/llvm/test/CodeGen/X86/pr46004.ll b/llvm/test/CodeGen/X86/pr46004.ll
index f7c7da089c36..829d6dfceba3 100644
--- a/llvm/test/CodeGen/X86/pr46004.ll
+++ b/llvm/test/CodeGen/X86/pr46004.ll
@@ -6,7 +6,17 @@
 define void @fuzz22357(i128 %a0) {
 ; X86-LABEL: fuzz22357:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
 ; X86-NEXT:    movb $0, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fuzz22357:
@@ -24,6 +34,15 @@ define void @fuzz22357(i128 %a0) {
 define void @fuzz22723(i128 %a0) {
 ; X86-LABEL: fuzz22723:
 ; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: fuzz22723:
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
index 50a967e1c2a1..ce9723b3a84b 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i32.ll
@@ -762,11 +762,15 @@ define i32 @x_to_s32(x86_fp80 %a) nounwind {
 define i32 @t_to_u32(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_u32:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixunstfsi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_u32:
@@ -797,12 +801,18 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_u32:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u32:
@@ -835,12 +845,18 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_u32:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixunstfsi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u32:
@@ -860,11 +876,15 @@ define i32 @t_to_u32(fp128 %a) nounwind {
 define i32 @t_to_s32(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_s32:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixtfsi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_s32:
@@ -895,12 +915,18 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_s32:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixtfsi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s32:
@@ -933,12 +959,18 @@ define i32 @t_to_s32(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_s32:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixtfsi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s32:
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index f516db8b30ff..3287869f2c60 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -1417,11 +1417,15 @@ define i64 @x_to_s64(x86_fp80 %a) nounwind {
 define i64 @t_to_u64(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_u64:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixunstfdi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_u64:
@@ -1452,12 +1456,18 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_u64:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixunstfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_u64:
@@ -1490,12 +1500,18 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_u64:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixunstfdi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_u64:
@@ -1515,11 +1531,15 @@ define i64 @t_to_u64(fp128 %a) nounwind {
 define i64 @t_to_s64(fp128 %a) nounwind {
 ; X86-AVX512-WIN-LABEL: t_to_s64:
 ; X86-AVX512-WIN:       # %bb.0:
-; X86-AVX512-WIN-NEXT:    subl $16, %esp
-; X86-AVX512-WIN-NEXT:    vmovups {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-WIN-NEXT:    pushl %ebp
+; X86-AVX512-WIN-NEXT:    movl %esp, %ebp
+; X86-AVX512-WIN-NEXT:    andl $-16, %esp
+; X86-AVX512-WIN-NEXT:    subl $32, %esp
+; X86-AVX512-WIN-NEXT:    vmovups 8(%ebp), %xmm0
 ; X86-AVX512-WIN-NEXT:    vmovups %xmm0, (%esp)
 ; X86-AVX512-WIN-NEXT:    calll ___fixtfdi
-; X86-AVX512-WIN-NEXT:    addl $16, %esp
+; X86-AVX512-WIN-NEXT:    movl %ebp, %esp
+; X86-AVX512-WIN-NEXT:    popl %ebp
 ; X86-AVX512-WIN-NEXT:    retl
 ;
 ; X86-AVX512-LIN-LABEL: t_to_s64:
@@ -1550,12 +1570,18 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ;
 ; X86-SSE-WIN-LABEL: t_to_s64:
 ; X86-SSE-WIN:       # %bb.0:
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X86-SSE-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-SSE-WIN-NEXT:    pushl %ebp
+; X86-SSE-WIN-NEXT:    movl %esp, %ebp
+; X86-SSE-WIN-NEXT:    andl $-16, %esp
+; X86-SSE-WIN-NEXT:    subl $16, %esp
+; X86-SSE-WIN-NEXT:    pushl 20(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 16(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 12(%ebp)
+; X86-SSE-WIN-NEXT:    pushl 8(%ebp)
 ; X86-SSE-WIN-NEXT:    calll ___fixtfdi
 ; X86-SSE-WIN-NEXT:    addl $16, %esp
+; X86-SSE-WIN-NEXT:    movl %ebp, %esp
+; X86-SSE-WIN-NEXT:    popl %ebp
 ; X86-SSE-WIN-NEXT:    retl
 ;
 ; X86-SSE-LIN-LABEL: t_to_s64:
@@ -1588,12 +1614,18 @@ define i64 @t_to_s64(fp128 %a) nounwind {
 ;
 ; X87-WIN-LABEL: t_to_s64:
 ; X87-WIN:       # %bb.0:
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
-; X87-WIN-NEXT:    pushl {{[0-9]+}}(%esp)
+; X87-WIN-NEXT:    pushl %ebp
+; X87-WIN-NEXT:    movl %esp, %ebp
+; X87-WIN-NEXT:    andl $-16, %esp
+; X87-WIN-NEXT:    subl $16, %esp
+; X87-WIN-NEXT:    pushl 20(%ebp)
+; X87-WIN-NEXT:    pushl 16(%ebp)
+; X87-WIN-NEXT:    pushl 12(%ebp)
+; X87-WIN-NEXT:    pushl 8(%ebp)
 ; X87-WIN-NEXT:    calll ___fixtfdi
 ; X87-WIN-NEXT:    addl $16, %esp
+; X87-WIN-NEXT:    movl %ebp, %esp
+; X87-WIN-NEXT:    popl %ebp
 ; X87-WIN-NEXT:    retl
 ;
 ; X87-LIN-LABEL: t_to_s64:
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 874913629e9e..8a287229a1cb 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -118,30 +118,33 @@ define i8 @scmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: scmp.8.128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %ecx
-; X86-NEXT:    sbbl %ebp, %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %ecx
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    cmpl %ecx, 8(%ebp)
+; X86-NEXT:    sbbl %eax, %edi
+; X86-NEXT:    movl 32(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl 36(%ebp), %ebx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    setl %cl
-; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebx, %edx
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    movl 24(%ebp), %esi
+; X86-NEXT:    cmpl 8(%ebp), %esi
+; X86-NEXT:    sbbl 12(%ebp), %eax
+; X86-NEXT:    sbbl 16(%ebp), %edi
+; X86-NEXT:    sbbl %edx, %ebx
 ; X86-NEXT:    setl %al
 ; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/sdiv_fix.ll b/llvm/test/CodeGen/X86/sdiv_fix.ll
index 4925f8bc6c8b..392bc83d9d5d 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix.ll
@@ -307,69 +307,70 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $80, %esp
+; X86-NEXT:    subl $112, %esp
 ; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %eax
 ; X86-NEXT:    movl 20(%ebp), %edx
-; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %esi
 ; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    shldl $31, %eax, %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $31, %ecx, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    shldl $31, %edi, %esi
+; X86-NEXT:    shldl $31, %ecx, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %eax
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %ebx
-; X86-NEXT:    testl %esi, %esi
-; X86-NEXT:    sets %al
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %cl
-; X86-NEXT:    xorb %al, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $1, %esi
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    sets %bl
+; X86-NEXT:    xorb %al, %bl
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
-; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    testb %bl, %al
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %edx
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab617..7df490f98492 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -370,67 +370,68 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $96, %esp
-; X86-NEXT:    movl 8(%ebp), %ecx
-; X86-NEXT:    movl 12(%ebp), %eax
-; X86-NEXT:    movl 20(%ebp), %esi
-; X86-NEXT:    movl %esi, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl %eax, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shldl $31, %eax, %edi
-; X86-NEXT:    shldl $31, %ecx, %eax
+; X86-NEXT:    subl $128, %esp
+; X86-NEXT:    movl 8(%ebp), %esi
+; X86-NEXT:    movl 12(%ebp), %edi
+; X86-NEXT:    movl 16(%ebp), %ecx
+; X86-NEXT:    movl 20(%ebp), %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    sarl $31, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    shll $31, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    shldl $31, %edi, %ebx
+; X86-NEXT:    shldl $31, %esi, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    shll $31, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 20(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 16(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    subl $1, %esi
-; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $1, %edi
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    testl %ebx, %ebx
-; X86-NEXT:    sets %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sbbl $0, %ebx
 ; X86-NEXT:    testl %ecx, %ecx
-; X86-NEXT:    sets %dl
-; X86-NEXT:    xorb %al, %dl
-; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    sets %al
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -438,41 +439,38 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    cmovel %esi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT:    cmpl $-1, %esi
-; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmpl $-1, %edi
 ; X86-NEXT:    sbbl $2147483647, %ecx # imm = 0x7FFFFFFF
-; X86-NEXT:    movl %edx, %ecx
-; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
-; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    movl %ebx, %ecx
 ; X86-NEXT:    sbbl $0, %ecx
 ; X86-NEXT:    movl $2147483647, %edx # imm = 0x7FFFFFFF
-; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    cmovll {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovgel %ecx, %edi
-; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    cmovgel %ecx, %ebx
+; X86-NEXT:    cmovgel %ecx, %eax
 ; X86-NEXT:    movl $-1, %ecx
-; X86-NEXT:    cmovgel %ecx, %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    negl %edi
-; X86-NEXT:    movl $-2147483648, %edi # imm = 0x80000000
-; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl $-1, %edi
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    cmovgel %ecx, %edi
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    negl %esi
+; X86-NEXT:    movl $-2147483648, %esi # imm = 0x80000000
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl $-1, %esi
+; X86-NEXT:    sbbl %eax, %esi
+; X86-NEXT:    sbbl %ebx, %ecx
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovgel %eax, %esi
+; X86-NEXT:    cmovgel %eax, %edi
 ; X86-NEXT:    movl $-2147483648, %eax # imm = 0x80000000
 ; X86-NEXT:    cmovgel %eax, %edx
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -805,137 +803,155 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $208, %esp
-; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    subl $240, %esp
+; X86-NEXT:    movl 12(%ebp), %esi
+; X86-NEXT:    movl 20(%ebp), %edi
 ; X86-NEXT:    movl 16(%ebp), %ebx
-; X86-NEXT:    movl 32(%ebp), %eax
-; X86-NEXT:    movl %eax, %edi
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    movl %ebx, %edx
-; X86-NEXT:    sarl $31, %edx
-; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl 32(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
 ; X86-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NEXT:    shrl $31, %ebx
 ; X86-NEXT:    shldl $31, %eax, %ebx
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 36(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal (%ecx,%ecx), %edx
-; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %edx, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 36(%ebp)
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%edi,%edi), %eax
+; X86-NEXT:    shrl $31, %edi
+; X86-NEXT:    shldl $31, %eax, %edi
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl 32(%ebp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 36(%ebp)
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl 28(%ebp), %edx
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sarl $31, %ebx
-; X86-NEXT:    movl 12(%ebp), %ecx
-; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    sarl $31, %edi
-; X86-NEXT:    leal (%ecx,%ecx), %eax
-; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %eax, %ecx
-; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %esi, %ebx
+; X86-NEXT:    sarl $31, %ebx
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%esi,%esi), %eax
+; X86-NEXT:    shrl $31, %esi
+; X86-NEXT:    shldl $31, %eax, %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl 40(%ebp), %esi
-; X86-NEXT:    sarl $31, %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    sarl $31, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl 24(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sarl $31, %eax
-; X86-NEXT:    leal (%ecx,%ecx), %edx
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    leal (%ecx,%ecx), %eax
 ; X86-NEXT:    shrl $31, %ecx
-; X86-NEXT:    shldl $31, %edx, %ecx
+; X86-NEXT:    shldl $31, %eax, %ecx
 ; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl 28(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl %ebx
-; X86-NEXT:    pushl 28(%ebp)
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl %edi
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __divti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl 40(%ebp), %eax
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %eax
@@ -949,18 +965,18 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    sets {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    testl %ebx, %ebx
 ; X86-NEXT:    sets %bl
-; X86-NEXT:    testl %edi, %edi
-; X86-NEXT:    sets %bh
-; X86-NEXT:    xorb %bl, %bh
+; X86-NEXT:    xorb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    orl %edi, %esi
-; X86-NEXT:    setne %bl
-; X86-NEXT:    testb %bh, %bl
+; X86-NEXT:    setne %bh
+; X86-NEXT:    testb %bl, %bh
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
@@ -1107,36 +1123,24 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    subl $1, %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT:    sbbl $0, %esi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl $0, %edi
+; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
 ; X86-NEXT:    sets %al
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    sets %ah
-; X86-NEXT:    xorb %al, %ah
-; X86-NEXT:    movb %ah, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl 40(%ebp)
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %eax
+; X86-NEXT:    cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT:    sets %cl
+; X86-NEXT:    xorb %al, %cl
+; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    calll __modti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -1144,38 +1148,38 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X86-NEXT:    orl %eax, %ecx
 ; X86-NEXT:    setne %al
 ; X86-NEXT:    testb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
 ; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
 ; X86-NEXT:    cmpl $-1, %ebx
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl $0, %eax
-; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    movl %edi, %eax
 ; X86-NEXT:    sbbl $0, %eax
 ; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    cmovgel %eax, %esi
-; X86-NEXT:    cmovgel %eax, %ecx
 ; X86-NEXT:    cmovgel %eax, %edi
+; X86-NEXT:    cmovgel %eax, %ecx
+; X86-NEXT:    cmovgel %eax, %esi
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    cmovgel %edx, %ebx
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    sbbl %edi, %eax
+; X86-NEXT:    sbbl %esi, %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl $-1, %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    sbbl %edi, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    cmovgel %eax, %ebx
-; X86-NEXT:    cmovgel %edx, %edi
-; X86-NEXT:    shldl $31, %ebx, %edi
+; X86-NEXT:    cmovgel %edx, %esi
+; X86-NEXT:    shldl $31, %ebx, %esi
 ; X86-NEXT:    movl 8(%ebp), %eax
-; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 12(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl %ecx, 8(%eax)
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
index d2b292f1a799..2ac2be5545df 100644
--- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
+++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll
@@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind {
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %vector.body
 ; CHECK-AVX2-NEXT:    # Parent Loop BB0_1 Depth=1
 ; CHECK-AVX2-NEXT:    # => This Inner Loop Header: Depth=2
-; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %ymm5
-; CHECK-AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
+; CHECK-AVX2-NEXT:    vmovdqu 1024(%rdx,%rsi), %xmm5
+; CHECK-AVX2-NEXT:    vmovdqu 1040(%rdx,%rsi), %xmm6
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm5, %rdi
 ; CHECK-AVX2-NEXT:    vpextrq $1, %xmm6, %r8
 ; CHECK-AVX2-NEXT:    vmovq %xmm5, %r9
diff --git a/llvm/test/CodeGen/X86/shift-combine.ll b/llvm/test/CodeGen/X86/shift-combine.ll
index 76cb4e87bae1..dfeef48897e0 100644
--- a/llvm/test/CodeGen/X86/shift-combine.ll
+++ b/llvm/test/CodeGen/X86/shift-combine.ll
@@ -792,14 +792,24 @@ define <4 x i32> @or_tree_with_mismatching_shifts_vec_i32(<4 x i32> %a, <4 x i32
 define void @combineShiftOfShiftedLogic(i128 %a1, i32 %a2, ptr %p) {
 ; X86-LABEL: combineShiftOfShiftedLogic:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    .cfi_def_cfa_register %ebp
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 24(%ebp), %eax
+; X86-NEXT:    movl 28(%ebp), %ecx
 ; X86-NEXT:    movl %eax, 20(%ecx)
 ; X86-NEXT:    movl $0, 16(%ecx)
 ; X86-NEXT:    movl $0, 12(%ecx)
 ; X86-NEXT:    movl $0, 8(%ecx)
 ; X86-NEXT:    movl $0, 4(%ecx)
 ; X86-NEXT:    movl $0, (%ecx)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    .cfi_def_cfa %esp, 4
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: combineShiftOfShiftedLogic:
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 767bd772ab7a..9323cd5b1917 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -212,9 +212,18 @@ entry:
 }
 
 define void @test_lshr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_lshr_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_lshr_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_lshr_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = lshr i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -222,9 +231,18 @@ entry:
 }
 
 define void @test_ashr_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_ashr_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_ashr_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_ashr_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = ashr i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -232,9 +250,18 @@ entry:
 }
 
 define void @test_shl_i128_outofrange(i128 %x, ptr nocapture %r) nounwind {
-; ALL-LABEL: test_shl_i128_outofrange:
-; ALL:       # %bb.0: # %entry
-; ALL-NEXT:    ret{{[l|q]}}
+; i686-LABEL: test_shl_i128_outofrange:
+; i686:       # %bb.0: # %entry
+; i686-NEXT:    pushl %ebp
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    movl %ebp, %esp
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    retl
+;
+; x86_64-LABEL: test_shl_i128_outofrange:
+; x86_64:       # %bb.0: # %entry
+; x86_64-NEXT:    retq
 entry:
 	%0 = shl i128 %x, -1
 	store i128 %0, ptr %r, align 16
@@ -874,26 +901,31 @@ define <2 x i256> @shl_zext_lshr_outofrange(<2 x i128> %a0) {
 define i128 @lshr_shl_mask(i128 %a0) {
 ; i686-LABEL: lshr_shl_mask:
 ; i686:       # %bb.0:
-; i686-NEXT:    pushl %edi
+; i686-NEXT:    pushl %ebp
 ; i686-NEXT:    .cfi_def_cfa_offset 8
+; i686-NEXT:    .cfi_offset %ebp, -8
+; i686-NEXT:    movl %esp, %ebp
+; i686-NEXT:    .cfi_def_cfa_register %ebp
+; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 12
-; i686-NEXT:    .cfi_offset %esi, -12
-; i686-NEXT:    .cfi_offset %edi, -8
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    .cfi_offset %esi, -16
+; i686-NEXT:    .cfi_offset %edi, -12
+; i686-NEXT:    movl 8(%ebp), %eax
+; i686-NEXT:    movl 24(%ebp), %ecx
+; i686-NEXT:    movl 28(%ebp), %edx
+; i686-NEXT:    movl 32(%ebp), %esi
 ; i686-NEXT:    movl $2147483647, %edi # imm = 0x7FFFFFFF
-; i686-NEXT:    andl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    andl 36(%ebp), %edi
 ; i686-NEXT:    movl %edi, 12(%eax)
 ; i686-NEXT:    movl %esi, 8(%eax)
 ; i686-NEXT:    movl %edx, 4(%eax)
 ; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    leal -8(%ebp), %esp
 ; i686-NEXT:    popl %esi
-; i686-NEXT:    .cfi_def_cfa_offset 8
 ; i686-NEXT:    popl %edi
-; i686-NEXT:    .cfi_def_cfa_offset 4
+; i686-NEXT:    popl %ebp
+; i686-NEXT:    .cfi_def_cfa %esp, 4
 ; i686-NEXT:    retl $4
 ;
 ; x86_64-LABEL: lshr_shl_mask:
diff --git a/llvm/test/CodeGen/X86/smax.ll b/llvm/test/CodeGen/X86/smax.ll
index 86891e964d96..509d4443e930 100644
--- a/llvm/test/CodeGen/X86/smax.ll
+++ b/llvm/test/CodeGen/X86/smax.ll
@@ -151,31 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %ebx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovll %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    cmovll 24(%ebp), %ebx
+; X86-NEXT:    cmovll 28(%ebp), %edi
+; X86-NEXT:    cmovll 32(%ebp), %edx
+; X86-NEXT:    cmovll %esi, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -717,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmovll %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/smin.ll b/llvm/test/CodeGen/X86/smin.ll
index 8907f6c4cd59..5e9fe27b41d2 100644
--- a/llvm/test/CodeGen/X86/smin.ll
+++ b/llvm/test/CodeGen/X86/smin.ll
@@ -151,32 +151,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    cmpl %ecx, 24(%ebp)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovll %ebx, %edx
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovll {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovll %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovll 24(%ebp), %ecx
+; X86-NEXT:    cmovll 28(%ebp), %edx
+; X86-NEXT:    cmovll 32(%ebp), %esi
+; X86-NEXT:    cmovll %edi, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -718,29 +720,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    cmovll %esi, %ecx
-; X86-NEXT:    cmovll %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovll %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 6a52acfe2fb3..7f17299b39e3 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -107,29 +107,33 @@ define i8 @ucmp.8.128(i128 %x, i128 %y) nounwind {
 ; X86-LABEL: ucmp.8.128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl 12(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %ecx
+; X86-NEXT:    cmpl %eax, 24(%ebp)
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    movl 16(%ebp), %ebx
+; X86-NEXT:    movl 32(%ebp), %eax
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    movl 20(%ebp), %ecx
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl %esi, %eax
 ; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    setb %al
-; X86-NEXT:    cmpl %ebx, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ebp, %esi
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    sbbl %edi, %ecx
+; X86-NEXT:    movl 8(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %edi
+; X86-NEXT:    sbbl 28(%ebp), %edx
+; X86-NEXT:    sbbl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/udiv_fix.ll b/llvm/test/CodeGen/X86/udiv_fix.ll
index 5b1e0545502b..82dfeeee1329 100644
--- a/llvm/test/CodeGen/X86/udiv_fix.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix.ll
@@ -153,26 +153,28 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %eax
-; X86-NEXT:    movl %esp, %esi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    subl $4, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    leal -4(%ebp), %esp
 ; X86-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/udiv_fix_sat.ll b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
index 30a7f80b2315..3da5973f9f90 100644
--- a/llvm/test/CodeGen/X86/udiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/udiv_fix_sat.ll
@@ -194,32 +194,34 @@ define i64 @func5(i64 %x, i64 %y) nounwind {
 ; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    andl $-16, %esp
-; X86-NEXT:    subl $32, %esp
+; X86-NEXT:    subl $80, %esp
 ; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl 12(%ebp), %ecx
+; X86-NEXT:    movl 16(%ebp), %edx
+; X86-NEXT:    movl 20(%ebp), %esi
+; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    movl %ecx, %edx
 ; X86-NEXT:    shrl %edx
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shldl $31, %eax, %ecx
+; X86-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NEXT:    shll $31, %eax
-; X86-NEXT:    movl %esp, %esi
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl 20(%ebp)
-; X86-NEXT:    pushl 16(%ebp)
-; X86-NEXT:    pushl $0
-; X86-NEXT:    pushl %edx
-; X86-NEXT:    pushl %ecx
-; X86-NEXT:    pushl %eax
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, (%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; X86-NEXT:    calll __udivti3
-; X86-NEXT:    addl $32, %esp
+; X86-NEXT:    subl $4, %esp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl $-1, %eax
 ; X86-NEXT:    movl $-1, %edx
 ; X86-NEXT:    jne .LBB4_2
 ; X86-NEXT:  # %bb.1:
-; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:  .LBB4_2:
 ; X86-NEXT:    leal -4(%ebp), %esp
diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll
index f589d4a7b04a..7ef859978cdb 100644
--- a/llvm/test/CodeGen/X86/umax.ll
+++ b/llvm/test/CodeGen/X86/umax.ll
@@ -232,31 +232,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl %ebx, %edx
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ebx
+; X86-NEXT:    movl 44(%ebp), %edi
+; X86-NEXT:    cmpl 24(%ebp), %ebx
+; X86-NEXT:    movl %edi, %esi
+; X86-NEXT:    sbbl 28(%ebp), %esi
+; X86-NEXT:    movl 48(%ebp), %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:    sbbl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %esi
+; X86-NEXT:    movl 52(%ebp), %ecx
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    cmovbl %ebp, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbl %esi, %eax
+; X86-NEXT:    cmovbl 24(%ebp), %ebx
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    cmovbl 32(%ebp), %edx
+; X86-NEXT:    cmovbl %esi, %ecx
+; X86-NEXT:    movl 8(%ebp), %eax
 ; X86-NEXT:    movl %ecx, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -282,37 +285,40 @@ define i128 @test_i128_1(i128 %a) nounwind {
 ; X86-LABEL: test_i128_1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 32(%ebp), %edx
+; X86-NEXT:    movl 24(%ebp), %eax
 ; X86-NEXT:    cmpl $1, %eax
-; X86-NEXT:    movl %eax, %ebx
-; X86-NEXT:    adcl $0, %ebx
-; X86-NEXT:    testl %edx, %edx
-; X86-NEXT:    movl $1, %edi
-; X86-NEXT:    cmovnel %eax, %edi
-; X86-NEXT:    cmovel %ebx, %edi
-; X86-NEXT:    xorl %ebx, %ebx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    negl %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl $1, %ebp
-; X86-NEXT:    cmovbl %eax, %ebp
-; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    cmovbl %edx, %ebx
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %esi, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    cmovel %edi, %ebp
-; X86-NEXT:    cmovel %edx, %ebx
-; X86-NEXT:    movl %ebx, 4(%eax)
-; X86-NEXT:    movl %ebp, (%eax)
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    adcl $0, %ecx
+; X86-NEXT:    cmpl $0, 28(%ebp)
+; X86-NEXT:    movl $1, %esi
+; X86-NEXT:    cmovnel %eax, %esi
+; X86-NEXT:    cmovel %ecx, %esi
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    movl %edx, %ecx
+; X86-NEXT:    negl %ecx
+; X86-NEXT:    movl 36(%ebp), %ecx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl $1, %ebx
+; X86-NEXT:    cmovbl %eax, %ebx
+; X86-NEXT:    cmovbl 28(%ebp), %edi
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    cmovel %esi, %ebx
+; X86-NEXT:    cmovel 28(%ebp), %edi
+; X86-NEXT:    movl %edi, 4(%eax)
+; X86-NEXT:    movl %ebx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -1312,29 +1318,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %esi, %ecx
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %edx, %ebx
+; X86-NEXT:    movl %edx, %edi
+; X86-NEXT:    sbbl %eax, %edi
 ; X86-NEXT:    cmovbl %esi, %ecx
-; X86-NEXT:    cmovbl %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umin.ll b/llvm/test/CodeGen/X86/umin.ll
index 7a5cdbb9ce75..c927abf3a426 100644
--- a/llvm/test/CodeGen/X86/umin.ll
+++ b/llvm/test/CodeGen/X86/umin.ll
@@ -147,32 +147,34 @@ define i128 @test_i128(i128 %a, i128 %b) nounwind {
 ; X86-LABEL: test_i128:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl %edx, %ebx
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl %eax, %ebp
-; X86-NEXT:    sbbl %ecx, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 40(%ebp), %ecx
+; X86-NEXT:    movl 44(%ebp), %edx
+; X86-NEXT:    movl 28(%ebp), %esi
+; X86-NEXT:    cmpl %ecx, 24(%ebp)
+; X86-NEXT:    sbbl %edx, %esi
+; X86-NEXT:    movl 48(%ebp), %esi
+; X86-NEXT:    movl 32(%ebp), %ebx
+; X86-NEXT:    sbbl %esi, %ebx
+; X86-NEXT:    movl 52(%ebp), %ebx
+; X86-NEXT:    movl 36(%ebp), %edi
 ; X86-NEXT:    movl %edi, %eax
-; X86-NEXT:    sbbl %ebp, %eax
-; X86-NEXT:    cmovbl %ebx, %edx
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmovbl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmovbl %edi, %ebp
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl %ebp, 12(%eax)
-; X86-NEXT:    movl %ecx, 8(%eax)
-; X86-NEXT:    movl %esi, 4(%eax)
-; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    sbbl %ebx, %eax
+; X86-NEXT:    cmovbl 24(%ebp), %ecx
+; X86-NEXT:    cmovbl 28(%ebp), %edx
+; X86-NEXT:    cmovbl 32(%ebp), %esi
+; X86-NEXT:    cmovbl %edi, %ebx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    leal -12(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
 ; X86-NEXT:    popl %ebx
@@ -727,29 +729,32 @@ define i128 @test_signbits_i128(i128 %a, i128 %b) nounwind {
 ;
 ; X86-LABEL: test_signbits_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
 ; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    shrdl $28, %edi, %ecx
-; X86-NEXT:    sarl $28, %edi
+; X86-NEXT:    andl $-16, %esp
+; X86-NEXT:    movl 32(%ebp), %esi
+; X86-NEXT:    movl 36(%ebp), %eax
+; X86-NEXT:    movl 48(%ebp), %ecx
+; X86-NEXT:    movl 52(%ebp), %edx
+; X86-NEXT:    shrdl $28, %edx, %ecx
+; X86-NEXT:    sarl $28, %edx
 ; X86-NEXT:    cmpl %ecx, %esi
-; X86-NEXT:    movl %edx, %ebx
-; X86-NEXT:    sbbl %edi, %ebx
+; X86-NEXT:    movl %eax, %edi
+; X86-NEXT:    sbbl %edx, %edi
 ; X86-NEXT:    cmovbl %esi, %ecx
-; X86-NEXT:    cmovbl %edx, %edi
-; X86-NEXT:    movl %edi, 4(%eax)
-; X86-NEXT:    sarl $31, %edi
+; X86-NEXT:    cmovbl %eax, %edx
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl %edx, 4(%eax)
 ; X86-NEXT:    movl %ecx, (%eax)
-; X86-NEXT:    movl %edi, 12(%eax)
-; X86-NEXT:    movl %edi, 8(%eax)
+; X86-NEXT:    sarl $31, %edx
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    leal -8(%ebp), %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
-; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
   %ax = ashr i128 %a, 64
   %bx = ashr i128 %b, 92
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 4c3170304b98..89afd1b00444 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -38,8 +38,8 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 20
-; X86-NEXT:    subl $24, %esp
-; X86-NEXT:    .cfi_def_cfa_offset 44
+; X86-NEXT:    subl $28, %esp
+; X86-NEXT:    .cfi_def_cfa_offset 48
 ; X86-NEXT:    .cfi_offset %esi, -20
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
@@ -147,7 +147,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
 ; X86-NEXT:    andb $1, %al
 ; X86-NEXT:    movb %al, 16(%ecx)
 ; X86-NEXT:    movl %ecx, %eax
-; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    addl $28, %esp
 ; X86-NEXT:    .cfi_def_cfa_offset 20
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 087cd30abee9..9bd38db3e9c4 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -104,6 +104,72 @@ entry:
 }
 declare <2 x double> @foo()
 
+define i64 @pr150117(<31 x i8> %a0) nounwind {
+; X86-LABEL: pr150117:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shll $8, %edx
+; X86-NEXT:    orl %ebx, %edx
+; X86-NEXT:    shll $8, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    shll $16, %ecx
+; X86-NEXT:    orl %edi, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shll $24, %esi
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    movd %esi, %xmm0
+; X86-NEXT:    pinsrw $2, %edx, %xmm0
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $8, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    pinsrw $3, %ecx, %xmm0
+; X86-NEXT:    movd %xmm0, %eax
+; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT:    movd %xmm0, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: pr150117:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
+; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %r8d
+; X64-NEXT:    shll $8, %r8d
+; X64-NEXT:    orl %edi, %r8d
+; X64-NEXT:    shll $8, %esi
+; X64-NEXT:    orl %edx, %esi
+; X64-NEXT:    shll $16, %ecx
+; X64-NEXT:    orl %esi, %ecx
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    shll $24, %edx
+; X64-NEXT:    orl %ecx, %edx
+; X64-NEXT:    movd %edx, %xmm0
+; X64-NEXT:    pinsrw $2, %r8d, %xmm0
+; X64-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orl %eax, %ecx
+; X64-NEXT:    pinsrw $3, %ecx, %xmm0
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    retq
+  %shuffle = shufflevector <31 x i8> %a0, <31 x i8> zeroinitializer, <32 x i32> <i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %bitcast = bitcast <32 x i8> %shuffle to <4 x i64>
+  %elt = extractelement <4 x i64> %bitcast, i64 0
+  ret i64 %elt
+}
+
 ; OSS-Fuzz #15662
 ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15662
 define <4 x i32> @ossfuzz15662(ptr %in) {
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135ddcec9..ef20cf2a09bb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -1724,6 +1724,269 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
   ret void
 }
 
+define <512 x i8> @PR153457(<512 x i8> %a0, <512 x i8> %a1) nounwind {
+; AVX512F-LABEL: PR153457:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %rbp
+; AVX512F-NEXT:    movq %rsp, %rbp
+; AVX512F-NEXT:    andq $-64, %rsp
+; AVX512F-NEXT:    subq $64, %rsp
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512F-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512F-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512F-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512F-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512F-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512F-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512F-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512F-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512F-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512F-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512F-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512F-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512F-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512F-NEXT:    movq %rbp, %rsp
+; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: PR153457:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    pushq %rbp
+; AVX512BW-NEXT:    movq %rsp, %rbp
+; AVX512BW-NEXT:    andq $-64, %rsp
+; AVX512BW-NEXT:    subq $64, %rsp
+; AVX512BW-NEXT:    movq %rdi, %rax
+; AVX512BW-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512BW-NEXT:    vpbroadcastq %xmm0, %ymm8
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm2, %ymm8
+; AVX512BW-NEXT:    vpbroadcastd %xmm0, %ymm10
+; AVX512BW-NEXT:    vpblendvb %ymm9, %ymm8, %ymm10, %ymm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm5, %xmm8
+; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm8 = xmm8[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512BW-NEXT:    vpshufb %xmm8, %xmm9, %xmm9
+; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512BW-NEXT:    vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-NEXT:    vinserti32x4 $2, %xmm9, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpslld $24, %xmm0, %xmm9
+; AVX512BW-NEXT:    vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX512BW-NEXT:    vpshufb %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT:    vporq %zmm8, %zmm1, %zmm1
+; AVX512BW-NEXT:    vinserti128 $1, %xmm7, %ymm1, %ymm8
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm2, %zmm8
+; AVX512BW-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm7[0],zero,xmm7[1],zero
+; AVX512BW-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT:    vpsrld $24, %xmm7, %xmm3
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512BW-NEXT:    vinserti64x4 $1, %ymm7, %zmm6, %zmm3
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vinserti32x4 $3, %xmm7, %zmm4, %zmm4
+; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpermq {{.*#+}} ymm6 = ymm0[0,1,2,0]
+; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT:    vpbroadcastb 16(%rbp), %ymm9
+; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm6[0,1,2,3],zmm0[4,5,6,7]
+; AVX512BW-NEXT:    vbroadcasti64x4 {{.*#+}} zmm6 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT:    vpermi2w %zmm7, %zmm5, %zmm6
+; AVX512BW-NEXT:    vmovdqa64 %zmm6, 320(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm3, 384(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, 192(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm8, 128(%rdi)
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512BW-NEXT:    movq %rbp, %rsp
+; AVX512BW-NEXT:    popq %rbp
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: PR153457:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:    andq $-64, %rsp
+; AVX512DQ-NEXT:    subq $64, %rsp
+; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    vpbroadcastq %xmm0, %ymm7
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0]
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm2, %ymm7
+; AVX512DQ-NEXT:    vpbroadcastd %xmm0, %ymm9
+; AVX512DQ-NEXT:    vpblendvb %ymm8, %ymm7, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm5, %xmm7
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm7 = xmm7[0,2,4,6,8,10,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vextracti32x4 $2, %zmm4, %xmm9
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,128]
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[5]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT:    vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[1]
+; AVX512DQ-NEXT:    vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-NEXT:    vpslld $24, %xmm0, %xmm11
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm3, %ymm3
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,2,4,6,8,10,12,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vmovdqa 16(%rbp), %xmm11
+; AVX512DQ-NEXT:    vpsrld $16, %xmm11, %xmm12
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm12[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm8 = xmm11[0],zero,xmm11[1],zero
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vpsrld $24, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm10, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm1 = zmm8[0,1,2,3],zmm1[4,5,6,7]
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm11, %ymm9, %ymm8
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,21,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm8, %zmm4, %zmm4
+; AVX512DQ-NEXT:    vpsrlq $48, %xmm11, %xmm8
+; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm8[0]
+; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm7, %zmm5, %zmm5
+; AVX512DQ-NEXT:    vpermq {{.*#+}} ymm7 = ymm0[0,1,2,0]
+; AVX512DQ-NEXT:    vpshufb {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT:    vpbroadcastb 16(%rbp), %ymm8
+; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[4,5,6,7]
+; AVX512DQ-NEXT:    vpsrlq $56, %xmm11, %xmm7
+; AVX512DQ-NEXT:    vmovdqa %ymm7, 416(%rdi)
+; AVX512DQ-NEXT:    vmovdqa %ymm6, 384(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm0, (%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm2, 128(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512DQ-NEXT:    vmovdqa64 %zmm1, 64(%rdi)
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512VBMI-LABEL: PR153457:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %rbp
+; AVX512VBMI-NEXT:    movq %rsp, %rbp
+; AVX512VBMI-NEXT:    andq $-64, %rsp
+; AVX512VBMI-NEXT:    subq $64, %rsp
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vmovdqa64 16(%rbp), %zmm7
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm8 = [32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,70,0,0,0,0,0,0,0,0,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm8 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm5, %zmm8
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,69,0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm4, %zmm5
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm4 = [32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,68,0,0,0,0,0,0,0,0]
+; AVX512VBMI-NEXT:    # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm3, %zmm4
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67,0,1,2,3,4,5,6,66,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,67]
+; AVX512VBMI-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm2, %zmm3
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm1 = zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],zero,zmm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,71]
+; AVX512VBMI-NEXT:    vpermi2b %zmm0, %zmm6, %zmm2
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,0,64,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm0, %zmm6
+; AVX512VBMI-NEXT:    vpmovsxbw {{.*#+}} zmm0 = [0,1,2,3,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,20,21,22,23,24,25,26,27,28,29,30,31]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm3, %zmm0
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [67,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,68,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm4, %zmm3
+; AVX512VBMI-NEXT:    vinserti32x4 $3, %xmm7, %zmm5, %zmm4
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm4 = zmm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,53,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm5 = [16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15,16,17,18,19,35,0,0,0,8,9,10,11,12,13,14,15]
+; AVX512VBMI-NEXT:    # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpermi2w %zmm7, %zmm8, %zmm5
+; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512VBMI-NEXT:    vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,39,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,65,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63]
+; AVX512VBMI-NEXT:    vpermi2b %zmm7, %zmm1, %zmm8
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm5, 320(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm4, 256(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm3, 192(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, 128(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm8, 64(%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm6, (%rdi)
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm2, 384(%rdi)
+; AVX512VBMI-NEXT:    movq %rbp, %rsp
+; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %shuffle1 = shufflevector <512 x i8> %a0, <512 x i8> zeroinitializer, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 5, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %shuffle2 = shufflevector <512 x i8> %shuffle1, <512 x i8> %a1, <512 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 512, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 513, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127, i32 128, i32 129, i32 130, i32 131, i32 132, i32 133, i32 134, i32 135, i32 514, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 168, i32 169, i32 170, i32 171, i32 172, i32 173, i32 174, i32 175, i32 176, i32 177, i32 178, i32 179, i32 180, i32 181, i32 182, i32 183, i32 184, i32 185, i32 186, i32 187, i32 188, i32 189, i32 190, i32 191, i32 515, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 224, i32 225, i32 226, i32 227, i32 228, i32 229, i32 230, i32 231, i32 232, i32 233, i32 234, i32 235, i32 236, i32 237, i32 238, i32 239, i32 240, i32 241, i32 242, i32 243, i32 244, i32 245, i32 246, i32 247, i32 516, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 280, i32 281, i32 282, i32 283, i32 284, i32 285, i32 286, i32 287, i32 288, i32 289, i32 290, i32 291, i32 292, i32 293, i32 294, i32 295, i32 296, i32 297, i32 298, i32 299, i32 300, i32 301, i32 302, i32 303, i32 517, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 336, i32 337, i32 338, i32 339, i32 340, i32 341, i32 342, i32 343, i32 344, i32 345, i32 346, i32 347, i32 348, i32 349, i32 350, i32 351, i32 352, i32 353, i32 354, i32 355, i32 356, i32 357, i32 358, i32 359, i32 518, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 392, i32 393, i32 394, i32 395, i32 396, i32 397, i32 398, i32 399, i32 400, i32 401, i32 402, i32 403, i32 404, i32 405, i32 406, i32 407, i32 408, i32 409, i32 410, i32 411, i32 412, i32 413, i32 414, i32 415, i32 519, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  ret <512 x i8> %shuffle2
+}
+
 define <64 x i8> @shuffle_v32i16_zextinreg_to_v16i32(<64 x i8> %a)  {
 ; ALL-LABEL: shuffle_v32i16_zextinreg_to_v16i32:
 ; ALL:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/wide-integer-cmp.ll b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
index a15d633d8538..12dccca76eb1 100644
--- a/llvm/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/llvm/test/CodeGen/X86/wide-integer-cmp.ll
@@ -92,6 +92,8 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    pushl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    .cfi_offset %esi, -8
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -101,15 +103,15 @@ define i32 @test_wide(i128 %a, i128 %b) {
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    jge .LBB4_2
+; CHECK-NEXT:    jge .LBB4_3
 ; CHECK-NEXT:  # %bb.1: # %bb1
 ; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    popl %esi
-; CHECK-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-NEXT:    retl
-; CHECK-NEXT:  .LBB4_2: # %bb2
-; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    jmp .LBB4_2
+; CHECK-NEXT:  .LBB4_3: # %bb2
 ; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:  .LBB4_2: # %bb1
+; CHECK-NEXT:    addl $8, %esp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
 ; CHECK-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
new file mode 100644
index 000000000000..5ac90a0af2e5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win32-int-runtime-libcalls.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck -check-prefix=CHECK32 %s
+; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck -check-prefix=CHECK64 %s
+
+define i64 @test_sdiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_sdiv_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __alldiv
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_sdiv_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    cqto
+; CHECK64-NEXT:    idivq %r8
+; CHECK64-NEXT:    retq
+  %ret = sdiv i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_srem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_srem_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __allrem
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_srem_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    cqto
+; CHECK64-NEXT:    idivq %r8
+; CHECK64-NEXT:    movq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = srem i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_udiv_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_udiv_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __aulldiv
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_udiv_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    xorl %edx, %edx
+; CHECK64-NEXT:    divq %r8
+; CHECK64-NEXT:    retq
+  %ret = udiv i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_urem_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_urem_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    pushl {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    calll __aullrem
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_urem_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rdx, %r8
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    xorl %edx, %edx
+; CHECK64-NEXT:    divq %r8
+; CHECK64-NEXT:    movq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = urem i64 %a, %b
+  ret i64 %ret
+}
+
+define i64 @test_mul_i64(i64 %a, i64 %b) {
+; CHECK32-LABEL: test_mul_i64:
+; CHECK32:       # %bb.0:
+; CHECK32-NEXT:    pushl %esi
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    movl %ecx, %eax
+; CHECK32-NEXT:    mull %esi
+; CHECK32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    addl %ecx, %edx
+; CHECK32-NEXT:    imull {{[0-9]+}}(%esp), %esi
+; CHECK32-NEXT:    addl %esi, %edx
+; CHECK32-NEXT:    popl %esi
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: test_mul_i64:
+; CHECK64:       # %bb.0:
+; CHECK64-NEXT:    movq %rcx, %rax
+; CHECK64-NEXT:    imulq %rdx, %rax
+; CHECK64-NEXT:    retq
+  %ret = mul i64 %a, %b
+  ret i64 %ret
+}
diff --git a/llvm/test/CodeGen/Xtensa/select-cc-fp.ll b/llvm/test/CodeGen/Xtensa/select-cc-fp.ll
index ee45ef006123..742770de23f3 100644
--- a/llvm/test/CodeGen/Xtensa/select-cc-fp.ll
+++ b/llvm/test/CodeGen/Xtensa/select-cc-fp.ll
@@ -103,8 +103,8 @@ define float @brcc_olt(float %a, float %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    wfr f8, a3
 ; CHECK-NEXT:    wfr f9, a2
-; CHECK-NEXT:    ule.s b0, f8, f9
-; CHECK-NEXT:    bt b0, .LBB3_2
+; CHECK-NEXT:    olt.s b0, f9, f8
+; CHECK-NEXT:    bf b0, .LBB3_2
 ; CHECK-NEXT:  # %bb.1: # %t1
 ; CHECK-NEXT:    l32r a8, .LCPI3_1
 ; CHECK-NEXT:    wfr f8, a8
@@ -135,8 +135,8 @@ define float @brcc_ole(float %a, float %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    wfr f8, a3
 ; CHECK-NEXT:    wfr f9, a2
-; CHECK-NEXT:    ult.s b0, f8, f9
-; CHECK-NEXT:    bt b0, .LBB4_2
+; CHECK-NEXT:    ole.s b0, f9, f8
+; CHECK-NEXT:    bf b0, .LBB4_2
 ; CHECK-NEXT:  # %bb.1: # %t1
 ; CHECK-NEXT:    l32r a8, .LCPI4_1
 ; CHECK-NEXT:    wfr f8, a8
@@ -232,7 +232,7 @@ define float @brcc_ueq(float %a, float %b) nounwind {
 ; CHECK-NEXT:    wfr f8, a3
 ; CHECK-NEXT:    wfr f9, a2
 ; CHECK-NEXT:    ueq.s b0, f9, f8
-; CHECK-NEXT:    bt b0, .LBB7_2
+; CHECK-NEXT:    bf b0, .LBB7_2
 ; CHECK-NEXT:  # %bb.1: # %t1
 ; CHECK-NEXT:    l32r a8, .LCPI7_1
 ; CHECK-NEXT:    wfr f8, a8
@@ -327,8 +327,8 @@ define float @brcc_ult(float %a, float %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    wfr f8, a3
 ; CHECK-NEXT:    wfr f9, a2
-; CHECK-NEXT:    ole.s b0, f8, f9
-; CHECK-NEXT:    bt b0, .LBB10_2
+; CHECK-NEXT:    ult.s b0, f9, f8
+; CHECK-NEXT:    bf b0, .LBB10_2
 ; CHECK-NEXT:  # %bb.1: # %t1
 ; CHECK-NEXT:    l32r a8, .LCPI10_1
 ; CHECK-NEXT:    wfr f8, a8
@@ -359,8 +359,8 @@ define float @brcc_ule(float %a, float %b) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    wfr f8, a3
 ; CHECK-NEXT:    wfr f9, a2
-; CHECK-NEXT:    olt.s b0, f8, f9
-; CHECK-NEXT:    bt b0, .LBB11_2
+; CHECK-NEXT:    ule.s b0, f9, f8
+; CHECK-NEXT:    bf b0, .LBB11_2
 ; CHECK-NEXT:  # %bb.1: # %t1
 ; CHECK-NEXT:    l32r a8, .LCPI11_1
 ; CHECK-NEXT:    wfr f8, a8
@@ -451,6 +451,21 @@ exit:
 }
 
 define float @copysign_f32(float %a, float %b) {
+; CHECK-LABEL: copysign_f32:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  # %bb.0: # %entry
+; CHECK-NEXT:    l32r a8, .LCPI14_0
+; CHECK-NEXT:    and a8, a3, a8
+; CHECK-NEXT:    l32r a9, .LCPI14_1
+; CHECK-NEXT:    and a9, a2, a9
+; CHECK-NEXT:    wfr f8, a9
+; CHECK-NEXT:    movi a9, 0
+; CHECK-NEXT:    beq a8, a9, .LBB14_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    neg.s f8, f8
+; CHECK-NEXT:  .LBB14_2: # %entry
+; CHECK-NEXT:    rfr a2, f8
+; CHECK-NEXT:    ret
 entry:
   %c = call float @llvm.copysign.f32(float %a, float %b)
   ret float %c
diff --git a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
index 0f8f505c51a5..5d73b2669ccd 100644
--- a/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
+++ b/llvm/test/DebugInfo/KeyInstructions/Generic/verify.ll
@@ -7,6 +7,8 @@
 
 define dso_local void @f() !dbg !10 {
 entry:
+; Include non-key location to check verifier is checking the whole function.
+  %0 = add i32 0, 0, !dbg !14
   ret void, !dbg !13
 }
 
@@ -20,3 +22,4 @@ entry:
 !11 = !DISubroutineType(types: !12)
 !12 = !{null}
 !13 = !DILocation(line: 1, column: 11, scope: !10, atomGroup: 1, atomRank: 1)
+!14 = !DILocation(line: 1, column: 11, scope: !10)
diff --git a/llvm/test/MC/Hexagon/system-inst.s b/llvm/test/MC/Hexagon/system-inst.s
index 7bc153359853..07f7ca0acb2d 100644
--- a/llvm/test/MC/Hexagon/system-inst.s
+++ b/llvm/test/MC/Hexagon/system-inst.s
@@ -89,6 +89,9 @@ crswap(r12,sgp0)
 #CHECK: 652dc000 { crswap(r13,sgp1) }
 crswap(r13,sgp1)
 
+#CHECK: 6d8ec000 { crswap(r15:14,s1:0) }
+crswap(r15:14,sgp1:0)
+
 #CHECK: 660fc00e { r14 = getimask(r15) }
 r14=getimask(r15)
 
diff --git a/llvm/test/MC/Hexagon/two_ext.s b/llvm/test/MC/Hexagon/two_ext.s
index 28b2aa3f1eca..09b51c5f029a 100644
--- a/llvm/test/MC/Hexagon/two_ext.s
+++ b/llvm/test/MC/Hexagon/two_ext.s
@@ -6,7 +6,7 @@
   if (!p1) call foo_b
 }
 # CHECK: 00004000 { immext(#0)
-# CHECK: 5d004100   if (p1) call 0x0
+# CHECK: 5d004100   if (p1) call 0x0 <.text>
 # CHECK: 00004000   immext(#0)
-# CHECK: 5d20c100   if (!p1) call 0x0 }
+# CHECK: 5d20c100   if (!p1) call 0x0 <.text> }
 
diff --git a/llvm/test/MC/RISCV/Relocations/mc-dump.s b/llvm/test/MC/RISCV/Relocations/mc-dump.s
index 24f3e67ebbdd..c646e6f54b26 100644
--- a/llvm/test/MC/RISCV/Relocations/mc-dump.s
+++ b/llvm/test/MC/RISCV/Relocations/mc-dump.s
@@ -2,12 +2,12 @@
 # RUN: llvm-mc -filetype=obj --triple=riscv64 --mattr=+relax %s -debug-only=mc-dump -o /dev/null 2>&1 | FileCheck %s
 
 #      CHECK:Sections:[
-# CHECK-NEXT:MCSection Name:.text
+# CHECK-NEXT:MCSection Name:.text LinkerRelaxable
 # CHECK-NEXT:0 Data Size:0 []
 # CHECK-NEXT:  Symbol @0 .text
 # CHECK-NEXT:0 Align Align:4 Fill:0 FillLen:1 MaxBytesToEmit:4 Nops
 # CHECK-NEXT:0 Data LinkerRelaxable Size:8 [97,00,00,00,e7,80,00,00]
-# CHECK-NEXT:  Fixup @0 Value:specifier(19,ext) Kind:4023
+# CHECK-NEXT:  Fixup @0 Value:specifier(19,ext) Kind:4023 LinkerRelaxable
 # CHECK-NEXT:  Symbol @0 $x
 # CHECK-NEXT:8 Align Align:8 Fill:0 FillLen:1 MaxBytesToEmit:8 Nops
 # CHECK-NEXT:12 Data Size:4 [13,05,30,00]
diff --git a/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s b/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s
new file mode 100644
index 000000000000..b12585265bf1
--- /dev/null
+++ b/llvm/test/MC/RISCV/xqcibi-linker-relaxation.s
@@ -0,0 +1,84 @@
+# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcilb,+experimental-xqcibi \
+# RUN:    %s -filetype=obj -o - -riscv-add-build-attributes \
+# RUN:    | llvm-objdump -dr -M no-aliases - \
+# RUN:    | FileCheck %s
+
+## This tests that we correctly emit relocations for linker relaxation when
+## relaxing `JAL` to `QC.E.JAL`.
+
+## PR150071
+
+.global foo
+
+# CHECK-LABEL: <branch_over_relaxable>:
+branch_over_relaxable:
+  jal x1, foo
+# CHECK: qc.e.jal 0x0 <branch_over_relaxable>
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM
+# CHECK-NEXT: R_RISCV_CUSTOM195 foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+  bne a0, a1, branch_over_relaxable
+# CHECK-NEXT: bne a0, a1, 0x6 <branch_over_relaxable+0x6>
+# CHECK-NEXT: R_RISCV_BRANCH branch_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  qc.e.bnei a0, 0x21, branch_over_relaxable
+# CHECK-NEXT: qc.e.bnei a0, 0x21, 0xa <branch_over_relaxable+0xa>
+# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM
+# CHECK-NEXT: R_RISCV_CUSTOM193 branch_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <short_jump_over_fixed>:
+short_jump_over_fixed:
+  nop
+# CHECK: c.nop
+  j short_jump_over_fixed
+# CHECK-NEXT: c.j 0x14 <short_jump_over_fixed+0x2>
+# CHECK-NEXT: R_RISCV_RVC_JUMP short_jump_over_fixed
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <short_jump_over_relaxable>:
+short_jump_over_relaxable:
+  call foo
+# CHECK: auipc ra, 0x0
+# CHECK-NEXT: R_RISCV_CALL_PLT foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+# CHECK-NEXT: jalr ra, 0x0(ra) <short_jump_over_relaxable>
+  j short_jump_over_relaxable
+# CHECK-NEXT: c.j 0x20 <short_jump_over_relaxable+0x8>
+# CHECK-NEXT: R_RISCV_RVC_JUMP short_jump_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <mid_jump_over_fixed>:
+mid_jump_over_fixed:
+  nop
+# CHECK: c.nop
+  .space 0x1000
+# CHECK-NEXT: ...
+  j mid_jump_over_fixed
+# CHECK-NEXT: jal zero, 0x1026 <mid_jump_over_fixed+0x1002>
+# CHECK-NEXT: R_RISCV_JAL mid_jump_over_fixed
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
+
+# CHECK-LABEL: <mid_jump_over_relaxable>:
+mid_jump_over_relaxable:
+  call foo
+# CHECK: auipc ra, 0x0
+# CHECK-NEXT: R_RISCV_CALL_PLT foo
+# CHECK-NEXT: R_RISCV_RELAX *ABS*
+# CHECK-NEXT: jalr ra, 0x0(ra) <mid_jump_over_relaxable>
+  .space 0x1000
+# CHECK-NEXT: ...
+  j mid_jump_over_relaxable
+# CHECK-NEXT: jal zero, 0x2034 <mid_jump_over_relaxable+0x1008>
+# CHECK-NEXT: R_RISCV_JAL mid_jump_over_relaxable
+# CHECK-NOT: R_RISCV_RELAX
+  ret
+# CHECK-NEXT: c.jr ra
diff --git a/llvm/test/MC/X86/intel-syntax-parentheses.s b/llvm/test/MC/X86/intel-syntax-parentheses.s
new file mode 100644
index 000000000000..ae53f6408907
--- /dev/null
+++ b/llvm/test/MC/X86/intel-syntax-parentheses.s
@@ -0,0 +1,10 @@
+// RUN: not llvm-mc -triple x86_64-unknown-unknown %s 2>&1 | FileCheck %s
+
+.intel_syntax
+
+// CHECK: error: invalid base+index expression
+    lea     rdi, [(label + rsi) + rip]
+// CHECK: leaq    1(%rax,%rdi), %rdi
+    lea     rdi, [(rax + rdi) + 1]
+label:
+    .quad 42
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
index 7abc32e4f1cd..f53127f01539 100644
--- a/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/fold-loop-of-urem.ll
@@ -1065,3 +1065,37 @@ for.body:
   %exitcond.not = icmp eq i32 %inc, %N
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
+
+define i64 @pr150611_add_offset_is_not_loop_invariant(i1 %cond) {
+; CHECK-LABEL: define i64 @pr150611_add_offset_is_not_loop_invariant(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[REMAMT:%.*]] = select i1 [[COND]], i64 2, i64 0
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT:    [[ADD_OFFSET:%.*]] = zext i1 [[COND]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add nuw i64 [[INDVARS]], [[ADD_OFFSET]]
+; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[ADD]], [[REMAMT]]
+; CHECK-NEXT:    [[INDVARS_NEXT]] = add nuw i64 [[INDVARS]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_NEXT]], 3
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_EXIT:.*]], label %[[FOR_BODY]]
+; CHECK:       [[FOR_EXIT]]:
+; CHECK-NEXT:    ret i64 [[REM]]
+;
+entry:
+  %remamt = select i1 %cond, i64 2, i64 0
+  br label %for.body
+
+for.body:
+  %indvars = phi i64 [ 0, %entry ], [ %indvars.next, %for.body ]
+  %add.offset = zext i1 %cond to i64
+  %add = add nuw i64 %indvars, %add.offset
+  %rem = urem i64 %add, %remamt
+  %indvars.next = add nuw i64 %indvars, 1
+  %exitcond = icmp eq i64 %indvars.next, 3
+  br i1 %exitcond, label %for.exit, label %for.body
+
+for.exit:
+  ret i64 %rem
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
index e390d4bdca63..303afc207c02 100644
--- a/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/zeroed-missing.ll
@@ -12,6 +12,6 @@ define ptr @undeclared_customalloc(i64 %size, i64 %align) {
   ret ptr %call
 }
 
-declare ptr @customalloc2(i64, i64) allockind("alloc") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
+declare ptr @customalloc2(i64, i64) allockind("alloc,uninitialized") "alloc-family"="customalloc2" "alloc-variant-zeroed"="customalloc2_zeroed"
 ; CHECK-DAG: declare ptr @customalloc2_zeroed(i64, i64) #[[CA2ATTR:[0-9]+]]
 ; CHECK-DAG: attributes #[[CA2ATTR]] = { allockind("alloc,zeroed") "alloc-family"="customalloc2" }
diff --git a/llvm/test/Transforms/GVN/cond_br.ll b/llvm/test/Transforms/GVN/cond_br.ll
index 19166d17a832..fb84b626c745 100644
--- a/llvm/test/Transforms/GVN/cond_br.ll
+++ b/llvm/test/Transforms/GVN/cond_br.ll
@@ -53,3 +53,22 @@ if.end:                                           ; preds = %if.else, %if.then
 }
 
 declare void @bar(i32)
+
+define void @indirectbr_could_not_split() {
+; CHECK-LABEL: define void @indirectbr_could_not_split() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br i1 false, label %[[IBR:.*]], label %[[EXIT:.*]]
+; CHECK:       [[IBR]]:
+; CHECK-NEXT:    indirectbr ptr null, [label %[[EXIT]], label %exit]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 false, label %ibr, label %exit
+
+ibr:
+  indirectbr ptr null, [label %exit, label %exit]
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
index b5c4f42655bb..71b463b2d2b0 100644
--- a/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
+++ b/llvm/test/Transforms/Inline/SystemZ/inline-target-attr.ll
@@ -12,28 +12,28 @@ entry:
 
 declare i32 @baz(...) #0
 
-define i32 @bar() #1 {
+define i32 @features_subset() #1 {
 entry:
   %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: bar
-; CHECK: call i32 @foo()
+; CHECK-LABEL: features_subset
+; CHECK: call i32 (...) @baz()
 }
 
-define i32 @qux() #0 {
+define i32 @features_equal() #0 {
 entry:
   %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: qux
+; CHECK-LABEL: features_equal
 ; CHECK: call i32 (...) @baz()
 }
 
-define i32 @quux() #2 {
+define i32 @features_different() #2 {
 entry:
-  %call = call i32 @bar()
+  %call = call i32 @foo()
   ret i32 %call
-; CHECK-LABEL: quux
-; CHECK: call i32 @bar()
+; CHECK-LABEL: features_different
+; CHECK: call i32 @foo()
 }
 
 
diff --git a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
index ab4448b460bf..820fff433e9e 100644
--- a/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
+++ b/llvm/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -213,7 +213,7 @@ define double @fmul_nnan_ninf_nneg_n0.0_commute(i127 %x) {
 
 define float @fmul_ninf_nnan_mul_zero_nsz(float nofpclass(inf nan) %f) {
 ; CHECK-LABEL: @fmul_ninf_nnan_mul_zero_nsz(
-; CHECK-NEXT:     ret float 0.000000e+00
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %r = fmul nsz float %f, 0.0
   ret float %r
@@ -221,7 +221,7 @@ define float @fmul_ninf_nnan_mul_zero_nsz(float nofpclass(inf nan) %f) {
 
 define float @fmul_ninf_nnan_mul_nzero_nsz(float nofpclass(inf nan) %f) {
 ; CHECK-LABEL: @fmul_ninf_nnan_mul_nzero_nsz(
-; CHECK-NEXT:     ret float 0.000000e+00
+; CHECK-NEXT:    ret float 0.000000e+00
 ;
   %r = fmul nsz float %f, -0.0
   ret float %r
@@ -1255,3 +1255,20 @@ define i1 @fptrunc_round_unknown_positive(double %unknown) {
   %cmp = fcmp nnan oge float %op, 0.0
   ret i1 %cmp
 }
+
+define half @fabs_select_fabs(half noundef %x) {
+; CHECK-LABEL: @fabs_select_fabs(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ABS1:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]])
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt half [[ABS1]], 0xH0000
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], half [[X]], half 0xH0000
+; CHECK-NEXT:    [[ABS2:%.*]] = call half @llvm.fabs.f16(half [[SEL]])
+; CHECK-NEXT:    ret half [[ABS2]]
+;
+entry:
+  %abs1 = call half @llvm.fabs.f16(half %x)
+  %cmp = fcmp ogt half %abs1, 0xH0000
+  %sel = select i1 %cmp, half %x, half 0xH0000
+  %abs2 = call half @llvm.fabs.f16(half %sel)
+  ret half %abs2
+}
diff --git a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
index 98b89abfeafd..057a60918ecf 100644
--- a/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
+++ b/llvm/test/Transforms/LoopDistribute/scev-inserted-runtime-check.ll
@@ -10,8 +10,13 @@ define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, p
 ; CHECK-NEXT:    br label [[FOR_BODY_LVER_CHECK:%.*]]
 ; CHECK:       for.body.lver.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
-; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[MUL_OVERFLOW]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
 ; CHECK:       for.body.ph.lver.orig:
 ; CHECK-NEXT:    br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; CHECK:       for.body.lver.orig:
@@ -75,7 +80,7 @@ define void @f(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d, p
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT2:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       for.end.loopexit1:
+; CHECK:       for.end.loopexit2:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
@@ -135,8 +140,13 @@ define void @f_with_offset(ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr n
 ; CHECK-NEXT:    br label [[FOR_BODY_LVER_CHECK:%.*]]
 ; CHECK:       for.body.lver.check:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
-; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = or i1 [[MUL_OVERFLOW]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]]
 ; CHECK:       for.body.ph.lver.orig:
 ; CHECK-NEXT:    br label [[FOR_BODY_LVER_ORIG:%.*]]
 ; CHECK:       for.body.lver.orig:
@@ -200,7 +210,7 @@ define void @f_with_offset(ptr noalias %b, ptr noalias %c, ptr noalias %d, ptr n
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT2:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
-; CHECK:       for.end.loopexit1:
+; CHECK:       for.end.loopexit2:
 ; CHECK-NEXT:    br label [[FOR_END]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    ret void
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
new file mode 100644
index 000000000000..dd524ab7d140
--- /dev/null
+++ b/llvm/test/Transforms/LoopSimplifyCFG/enter-through-indirectbr.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -enable-loop-simplifycfg-term-folding=true -passes='require<domtree>,loop(loop-simplifycfg)' -verify-loop-info -verify-dom-info -verify-loop-lcssa < %s | FileCheck %s
+
+define void @test(ptr %addr) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[ADDR:%.*]]) {
+; CHECK-NEXT:    indirectbr ptr [[ADDR]], [label %[[A:.*]], label %C]
+; CHECK:       [[A]]:
+; CHECK-NEXT:    br i1 true, label %[[B:.*]], label %[[C_LOOPEXIT:.*]]
+; CHECK:       [[B]]:
+; CHECK-NEXT:    br i1 true, label %[[A]], label %[[C_LOOPEXIT]]
+; CHECK:       [[C_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[C:.*]]
+; CHECK:       [[C]]:
+; CHECK-NEXT:    unreachable
+;
+
+  indirectbr ptr %addr, [label %A, label %C]
+
+A:
+  br i1 true, label %B, label %C
+
+B:
+  br i1 true, label %A, label %C
+
+C:
+  unreachable
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
index 451574a258c2..427a05cc1c84 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
index e93ee5563b05..1a8e5940d88e 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -42,18 +42,59 @@ define float @fminnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
index 544ef5c82c7a..88590fa443bd 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/multiple-result-intrinsics.ll
@@ -1,8 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "(:|sincos|modf|extractvalue|store)" --version 5
 ; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve < %s -S -o - -debug-only=loop-vectorize 2>%t.1 | FileCheck %s --check-prefix=CHECK
 ; RUN: opt -passes=loop-vectorize -mtriple=aarch64-gnu-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.2 | FileCheck %s --check-prefix=CHECK-ARMPL
+; RUN: opt -passes=loop-vectorize -mtriple=aarch64-amazon-linux -mcpu=neoverse-v1 -mattr=+sve -vector-library=ArmPL < %s -S -o - -debug-only=loop-vectorize 2>%t.3 | FileCheck %s --check-prefix=CHECK-ARMPL
 ; RUN: FileCheck --input-file=%t.1 --check-prefix=CHECK-COST %s
 ; RUN: FileCheck --input-file=%t.2 --check-prefix=CHECK-COST-ARMPL %s
+; Check we vectorize the functions with the vector-library on aarch64-amazon-linux:
+; RUN: FileCheck --input-file=%t.3 --check-prefix=CHECK-COST-ARMPL %s
 ; REQUIRES: asserts
 
 ; CHECK-COST-LABEL: sincos_f32
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
index 51e24924cae7..7a7e5a1a9b24 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a510 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA510
 ; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a520 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA520
+; NOTE: Downstream issue: #482 (Use FeatureUseFixedOverScalableIfEqualCost for A320)
+; RUN: opt < %s -mtriple=aarch64-none-elf -mcpu=cortex-a320 -mattr=+sve -passes=loop-vectorize -S | FileCheck %s --check-prefix=CHECK-CA320
 
 define void @sve_add(ptr  %dst, ptr  %a, ptr  %b, i64 %n) {
 ; CHECK-CA510-LABEL: define void @sve_add(
@@ -137,6 +139,74 @@ define void @sve_add(ptr  %dst, ptr  %a, ptr  %b, i64 %n) {
 ; CHECK-CA520:       [[FOR_COND_CLEANUP]]:
 ; CHECK-CA520-NEXT:    ret void
 ;
+; NOTE: Downstream issue: #482 (Use FeatureUseFixedOverScalableIfEqualCost for A320)
+; CHECK-CA320-LABEL: define void @sve_add(
+; CHECK-CA320-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-CA320-NEXT:  [[ENTRY:.*:]]
+; CHECK-CA320-NEXT:    [[B3:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-CA320-NEXT:    [[A2:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-CA320-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
+; CHECK-CA320-NEXT:    [[CMP9_NOT:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-CA320-NEXT:    br i1 [[CMP9_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK-CA320:       [[FOR_BODY_PREHEADER]]:
+; CHECK-CA320-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-CA320-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-CA320:       [[VECTOR_MEMCHECK]]:
+; CHECK-CA320-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[A2]]
+; CHECK-CA320-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-CA320-NEXT:    [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]]
+; CHECK-CA320-NEXT:    [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32
+; CHECK-CA320-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]]
+; CHECK-CA320-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK-CA320:       [[VECTOR_PH]]:
+; CHECK-CA320-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-CA320-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-CA320-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-CA320:       [[VECTOR_BODY]]:
+; CHECK-CA320-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-CA320-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDEX]]
+; CHECK-CA320-NEXT:    [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 0
+; CHECK-CA320-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw float, ptr [[TMP2]], i32 4
+; CHECK-CA320-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP13]], align 4
+; CHECK-CA320-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP3]], align 4
+; CHECK-CA320-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDEX]]
+; CHECK-CA320-NEXT:    [[TMP14:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 0
+; CHECK-CA320-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[TMP4]], i32 4
+; CHECK-CA320-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP14]], align 4
+; CHECK-CA320-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP5]], align 4
+; CHECK-CA320-NEXT:    [[TMP6:%.*]] = fadd fast <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD]]
+; CHECK-CA320-NEXT:    [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD5]]
+; CHECK-CA320-NEXT:    [[TMP8:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDEX]]
+; CHECK-CA320-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 0
+; CHECK-CA320-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4
+; CHECK-CA320-NEXT:    store <4 x float> [[TMP6]], ptr [[TMP15]], align 4
+; CHECK-CA320-NEXT:    store <4 x float> [[TMP7]], ptr [[TMP9]], align 4
+; CHECK-CA320-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-CA320-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-CA320-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-CA320:       [[MIDDLE_BLOCK]]:
+; CHECK-CA320-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-CA320-NEXT:    br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-CA320:       [[SCALAR_PH]]:
+; CHECK-CA320-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-CA320-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-CA320:       [[FOR_BODY]]:
+; CHECK-CA320-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-CA320-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-CA320-NEXT:    [[TMP11:%.*]] = load float, ptr [[ARRAYIDX]], align 4
+; CHECK-CA320-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-CA320-NEXT:    [[TMP12:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
+; CHECK-CA320-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP12]], [[TMP11]]
+; CHECK-CA320-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds nuw float, ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-CA320-NEXT:    store float [[ADD]], ptr [[ARRAYIDX4]], align 4
+; CHECK-CA320-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-CA320-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
+; CHECK-CA320-NEXT:    br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-CA320:       [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-CA320-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK-CA320:       [[FOR_COND_CLEANUP]]:
+; CHECK-CA320-NEXT:    ret void
+;
 entry:
   %cmp9.not = icmp eq i64 %n, 0
   br i1 %cmp9.not, label %for.cond.cleanup, label %for.body
@@ -166,3 +236,9 @@ for.cond.cleanup:                                 ; preds = %for.cond.cleanup.lo
 ; CHECK-CA520: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-CA520: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
 ;.
+; NOTE: Downstream issue: #482 (Use FeatureUseFixedOverScalableIfEqualCost for A320).
+; CHECK-CA320: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK-CA320: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK-CA320: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-CA320: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 147e949808b5..c1566c4dd32b 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -1045,8 +1045,8 @@ define i64 @live_in_known_1_via_scev() {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 3, i64 1, i64 1, i64 1>, [[VECTOR_PH]] ], [ [[VEC_PHI]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
-; CHECK-NEXT:    br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
+; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> [[VEC_PHI]])
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
@@ -1213,6 +1213,7 @@ define i32 @g(i64 %n) {
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[TMP28:%.*]] = trunc i64 [[N]] to i32
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp ugt i64 [[N]], 4294967295
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; CHECK:       vector.main.loop.iter.check:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
index b2e080fef2e5..a2eddad17921 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -42,18 +42,59 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 4
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI1]], <4 x float> [[TMP8]]
+; CHECK-NEXT:    [[TMP14:%.*]] = select i1 [[TMP6]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[TMP11]], <4 x float> [[TMP12]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <4 x i1> [[TMP15]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = and i1 [[CMP_N]], [[TMP16]]
+; CHECK-NEXT:    br i1 [[TMP17]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP13]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP13]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
index 5661406b88a5..1ca5586942d7 100644
--- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fmaxnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,234 @@ define float @fmaxnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fmaxnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_induction_starts_at_10(ptr %src, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_10(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -10
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 10, [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 10, [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 10, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 10, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_induction_starts_at_value(ptr %src, i64 %start, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_induction_starts_at_value(
+; CHECK-SAME: ptr [[SRC:%.*]], i64 [[START:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[N]], [[START]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[IV:%.*]] = add i64 [[START]], [[INDEX]]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP6]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[TMP4]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP6]], i64 [[INDEX]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP8]])
+; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[START]], [[TMP9]]
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP12:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x i1> [[TMP12]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[CMP_N]], [[TMP13]]
+; CHECK-NEXT:    br i1 [[TMP14]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ %start, %entry ], [ %iv.next, %loop ]
+  %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
+  %l = load float, ptr %gep.src, align 4
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %n
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret float %max.next
+}
+
+define float @fmaxnum_with_additional_add(ptr noalias %src, ptr noalias %src.2, i64 %n) {
+; CHECK-LABEL: define float @fmaxnum_with_additional_add(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUM_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC_2:%.*]] = getelementptr inbounds nuw i32, ptr [[SRC_2]], i64 [[IV]]
+; CHECK-NEXT:    [[L_SRC_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 4
+; CHECK-NEXT:    [[SUM_NEXT]] = add i32 [[SUM]], [[L_SRC_2]]
 ; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
 ; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
-; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[MAX]], float [[L]])
+; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.maxnum.f32(float [[L]], float [[MAX]])
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
 ; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    store i32 [[SUM_NEXT_LCSSA]], ptr [[SRC_2]], align 4
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -247,14 +496,19 @@ entry:
 loop:
   %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
   %max = phi float [ -1.000000e+07, %entry ], [ %max.next, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %sum.next, %loop ]
+  %gep.src.2 = getelementptr inbounds nuw i32, ptr %src.2, i64 %iv
+  %l.src.2 = load i32, ptr %gep.src.2, align 4
+  %sum.next = add i32 %sum, %l.src.2
   %gep.src = getelementptr inbounds nuw float, ptr %src, i64 %iv
   %l = load float, ptr %gep.src, align 4
-  %max.next = call float @llvm.maxnum.f32(float %max, float %l)
+  %max.next = call float @llvm.maxnum.f32(float %l, float %max)
   %iv.next = add nuw nsw i64 %iv, 1
   %ec = icmp eq i64 %iv.next, %n
   br i1 %ec, label %exit, label %loop
 
 exit:
+  store i32 %sum.next, ptr %src.2
   ret float %max.next
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
index 148beb64a360..68bc8d0640a3 100644
--- a/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
+++ b/llvm/test/Transforms/LoopVectorize/fmin-without-fast-math-flags.ll
@@ -192,18 +192,51 @@ define float @fminnum_1(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_1(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[VEC_PHI]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[L]], float [[MAX]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
@@ -227,18 +260,51 @@ define float @fminnum_2(ptr %src, i64 %n) {
 ; CHECK-LABEL: define float @fminnum_2(
 ; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x float> [ splat (float -1.000000e+07), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw float, ptr [[GEP_SRC]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP4]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP2]])
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[TMP3]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP3]], [[TMP5]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x float> [[VEC_PHI]], <4 x float> [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP3]], i64 [[IV]], i64 [[N_VEC]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP7]])
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT:    [[TMP10:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[CMP_N]], [[TMP11]]
+; CHECK-NEXT:    br i1 [[TMP12]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ -1.000000e+07, %[[ENTRY]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[MAX:%.*]] = phi float [ -1.000000e+07, %[[ENTRY]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV]]
-; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
+; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[MAX_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[GEP_SRC1:%.*]] = getelementptr inbounds nuw float, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT:    [[L:%.*]] = load float, ptr [[GEP_SRC1]], align 4
 ; CHECK-NEXT:    [[MAX_NEXT]] = call float @llvm.minnum.f32(float [[MAX]], float [[L]])
-; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ]
+; CHECK-NEXT:    [[MAX_NEXT_LCSSA:%.*]] = phi float [ [[MAX_NEXT]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[MAX_NEXT_LCSSA]]
 ;
 entry:
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
deleted file mode 100644
index 661e8eb666d5..000000000000
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-3.ll
+++ /dev/null
@@ -1,124 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Check that the interleaved-mem-access analysis currently does not create an
-; interleave group for access 'a' due to the possible pointer wrap-around.
-;
-; To begin with, in this test the candidate interleave group can be created
-; only when getPtrStride is called with Assume=true. Next, because
-; the interleave-group of the loads is not full (has gaps), we also need to check
-; for possible pointer wrapping. Here we currently use Assume=false and as a
-; result cannot prove the transformation is safe and therefore invalidate the
-; candidate interleave group.
-;
-
-; void func(unsigned * __restrict a, unsigned * __restrict b, unsigned char x, unsigned char y) {
-;  int i = 0;
-;  for (unsigned char index = x; i < y; index +=2, ++i)
-;    b[i] = aptr 2;
-;
-; }
-
-define void @_Z4funcPjS_hh(ptr noalias nocapture readonly %a, ptr noalias nocapture %b, i8 zeroext %x, i8 zeroext %y) local_unnamed_addr {
-; CHECK-LABEL: define void @_Z4funcPjS_hh(
-; CHECK-SAME: ptr noalias readonly captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]], i8 zeroext [[X:%.*]], i8 zeroext [[Y:%.*]]) local_unnamed_addr {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[CMP9:%.*]] = icmp eq i8 [[Y]], 0
-; CHECK-NEXT:    br i1 [[CMP9]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
-; CHECK:       [[FOR_BODY_PREHEADER]]:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i8 [[Y]] to i64
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i8 [[Y]], 5
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
-; CHECK:       [[VECTOR_SCEVCHECK]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8
-; CHECK-NEXT:    [[MUL_RESULT:%.*]] = shl i8 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = xor i8 [[X]], -1
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i8 [[MUL_RESULT]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[TMP0]], 127
-; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    br i1 [[TMP5]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
-; CHECK:       [[VECTOR_PH]]:
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 3
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP8]], i64 4, i64 [[N_MOD_VF]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub nsw i64 [[WIDE_TRIP_COUNT]], [[TMP7]]
-; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[DOTCAST]], 1
-; CHECK-NEXT:    [[IND_END:%.*]] = add i8 [[X]], [[TMP6]]
-; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
-; CHECK:       [[VECTOR_BODY]]:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[DOTCAST3:%.*]] = trunc i64 [[INDEX]] to i8
-; CHECK-NEXT:    [[TMP9:%.*]] = shl i8 [[DOTCAST3]], 1
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[X]], [[TMP9]]
-; CHECK-NEXT:    [[TMP14:%.*]] = zext i8 [[OFFSET_IDX]] to i64
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[TMP14]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP15]], align 4
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-NEXT:    [[TMP24:%.*]] = shl <4 x i32> [[TMP23]], splat (i32 1)
-; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    store <4 x i32> [[TMP24]], ptr [[TMP25]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK:       [[MIDDLE_BLOCK]]:
-; CHECK-NEXT:    br label %[[SCALAR_PH]]
-; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[FOR_BODY_PREHEADER]] ], [ [[X]], %[[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
-; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
-; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
-; CHECK:       [[FOR_COND_CLEANUP]]:
-; CHECK-NEXT:    ret void
-; CHECK:       [[FOR_BODY]]:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[INDEX_011:%.*]] = phi i8 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ]
-; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i8 [[INDEX_011]] to i64
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IDXPROM]]
-; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[TMP27]], 1
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
-; CHECK-NEXT:    [[ADD]] = add i8 [[INDEX_011]], 2
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-;
-entry:
-  %cmp9 = icmp eq i8 %y, 0
-  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
-
-for.body.preheader:
-  %wide.trip.count = zext i8 %y to i64
-  br label %for.body
-
-for.cond.cleanup.loopexit:
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-
-for.body:
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
-  %index.011 = phi i8 [ %add, %for.body ], [ %x, %for.body.preheader ]
-  %idxprom = zext i8 %index.011 to i64
-  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
-  %0 = load i32, ptr %arrayidx, align 4
-  %mul = shl i32 %0, 1
-  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %indvars.iv
-  store i32 %mul, ptr %arrayidx2, align 4
-  %add = add i8 %index.011, 2
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-requiring-scev-predicates.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-requiring-scev-predicates.ll
new file mode 100644
index 000000000000..38438941eb35
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-requiring-scev-predicates.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -S -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the interleaved-mem-access analysis currently does not create an
+; interleave group for access 'a' due to the possible pointer wrap-around.
+;
+; To begin with, in this test the candidate interleave group can be created
+; only when getPtrStride is called with Assume=true. Next, because
+; the interleave-group of the loads is not full (has gaps), we also need to check
+; for possible pointer wrapping. Here we currently use Assume=false and as a
+; result cannot prove the transformation is safe and therefore invalidate the
+; candidate interleave group.
+;
+
+; void func(unsigned * __restrict a, unsigned * __restrict b, unsigned char x, unsigned char y) {
+;  int i = 0;
+;  for (unsigned char index = x; i < y; index +=2, ++i)
+;    b[i] = aptr 2;
+;
+; }
+
+define void @wrap_around_scev_check(ptr noalias %a, ptr noalias %b, i8 %x, i8 %y) {
+; CHECK-LABEL: define void @wrap_around_scev_check(
+; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[X:%.*]], i8 [[Y:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp eq i8 [[Y]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label %[[EXIT:.*]], label %[[LOOP_PREHEADER:.*]]
+; CHECK:       [[LOOP_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i8 [[Y]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i8
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 2, i8 [[TMP1]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i8, i1 } [[MUL1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i8 [[TMP2]], [[X]]
+; CHECK-NEXT:    [[TMP5:%.*]] = or i1 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp ugt i64 [[TMP0]], 255
+; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP5]], [[TMP17]]
+; CHECK-NEXT:    br i1 [[TMP18]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP6]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP7]]
+; CHECK-NEXT:    [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i8
+; CHECK-NEXT:    [[TMP8:%.*]] = mul i8 [[DOTCAST]], 2
+; CHECK-NEXT:    [[TMP9:%.*]] = add i8 [[X]], [[TMP8]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOTCAST2:%.*]] = trunc i64 [[INDEX]] to i8
+; CHECK-NEXT:    [[TMP10:%.*]] = mul i8 [[DOTCAST2]], 2
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = add i8 [[X]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = zext i8 [[OFFSET_IDX]] to i64
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP12]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT:    [[TMP13:%.*]] = shl <4 x i32> [[STRIDED_VEC]], splat (i32 1)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP13]], ptr [[TMP19]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[LOOP_PREHEADER]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL3:%.*]] = phi i8 [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ [[X]], %[[LOOP_PREHEADER]] ], [ [[X]], %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[INDEX_011:%.*]] = phi i8 [ [[BC_RESUME_VAL3]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i8 [[INDEX_011]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl i32 [[TMP16]], 1
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD]] = add i8 [[INDEX_011]], 2
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       [[EXIT_LOOPEXIT]]:
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp9 = icmp eq i8 %y, 0
+  br i1 %cmp9, label %exit, label %loop.preheader
+
+loop.preheader:
+  %wide.trip.count = zext i8 %y to i64
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %loop.preheader ], [ %iv.next, %loop ]
+  %index.011 = phi i8 [ %x, %loop.preheader ], [ %add, %loop ]
+  %idxprom = zext i8 %index.011 to i64
+  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %idxprom
+  %0 = load i32, ptr %arrayidx, align 4
+  %mul = shl i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, ptr %b, i64 %iv
+  store i32 %mul, ptr %arrayidx2, align 4
+  %add = add i8 %index.011, 2
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp eq i64 %iv.next, %wide.trip.count
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; For %gep, we have the following SCEV: ((4 * (zext i4 {0,+,5}<%loop> to i64))<nuw><nsw> + %x).
+; Note the i4 bit wide AddRec {0,+,5}. It is known to wrap in the loop with trip count 16.
+define void @wrap_predicate_for_interleave_group_wraps_for_known_trip_count(ptr noalias %x, ptr noalias %out) {
+; CHECK-LABEL: define void @wrap_predicate_for_interleave_group_wraps_for_known_trip_count(
+; CHECK-SAME: ptr noalias [[X:%.*]], ptr noalias [[OUT:%.*]]) {
+; CHECK-NEXT:  [[START:.*]]:
+; CHECK-NEXT:    br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[MUL:%.*]] = call { i4, i1 } @llvm.umul.with.overflow.i4(i4 5, i4 -1)
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i4, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i4, i1 } [[MUL]], 1
+; CHECK-NEXT:    br i1 [[MUL_OVERFLOW]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul nuw nsw i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], 15
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i64 [[TMP1]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <20 x i32>, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <20 x i32> [[WIDE_VEC]], <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds nuw i32, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP3]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[STRIDED_VEC]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 12, %[[MIDDLE_BLOCK]] ], [ 0, %[[START]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_MUL5:%.*]] = mul nuw nsw i64 [[IV]], 5
+; CHECK-NEXT:    [[IV_MUL5_MASKED:%.*]] = and i64 [[IV_MUL5]], 15
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i64 [[IV_MUL5_MASKED]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[OUT_I:%.*]] = getelementptr inbounds nuw i32, ptr [[OUT]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[V]], ptr [[OUT_I]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+start:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %start ], [ %iv.next, %loop ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.mul5 = mul nuw nsw i64 %iv, 5
+  %iv.mul5.masked = and i64 %iv.mul5, 15
+  %gep = getelementptr inbounds nuw i32, ptr %x, i64 %iv.mul5.masked
+  %v = load i32, ptr %gep, align 4
+  %out.i = getelementptr inbounds nuw i32, ptr %out, i64 %iv
+  store i32 %v, ptr %out.i, align 4
+  %exitcond.not = icmp eq i64 %iv.next, 16
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; For %gep, we have the following SCEV: ((4 * (zext i4 {0,+,3}<%loop> to i64))<nuw><nsw> + %x).
+; Note the i4 bit wide AddRec {0,+,3}. It may wrap, depending on the trip count.
+define void @wrap_predicate_for_interleave_group_unknown_trip_count(ptr noalias %x, ptr noalias %out, i64 %n) {
+; CHECK-LABEL: define void @wrap_predicate_for_interleave_group_unknown_trip_count(
+; CHECK-SAME: ptr noalias [[X:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[START:.*]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[N]], -1
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[TMP0]] to i4
+; CHECK-NEXT:    [[MUL:%.*]] = call { i4, i1 } @llvm.umul.with.overflow.i4(i4 3, i4 [[TMP9]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i4, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i4, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 15
+; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[MUL_OVERFLOW]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[TMP2]], i64 4, i64 [[N_MOD_VF]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[TMP7]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP8:%.*]] = mul nuw nsw i64 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = and i64 [[TMP8]], 15
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i64 [[TMP3]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x i32>, ptr [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[OUT]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[STRIDED_VEC]], ptr [[TMP9]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[SCALAR_PH]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[START]] ], [ 0, %[[VECTOR_SCEVCHECK]] ]
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT:    [[IV_MUL5:%.*]] = mul nuw nsw i64 [[IV]], 3
+; CHECK-NEXT:    [[IV_MUL5_MASKED:%.*]] = and i64 [[IV_MUL5]], 15
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[X]], i64 [[IV_MUL5_MASKED]]
+; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[OUT_I:%.*]] = getelementptr inbounds nuw i32, ptr [[OUT]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[V]], ptr [[OUT_I]], align 4
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+start:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %start ], [ %iv.next, %loop ]
+  %iv.next = add nuw nsw i64 %iv, 1
+  %iv.mul3 = mul nuw nsw i64 %iv, 3
+  %iv.mul3.masked = and i64 %iv.mul3, 15
+  %gep = getelementptr inbounds nuw i32, ptr %x, i64 %iv.mul3.masked
+  %v = load i32, ptr %gep, align 4
+  %out.i = getelementptr inbounds nuw i32, ptr %out, i64 %iv
+  store i32 %v, ptr %out.i, align 4
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
index 85a90f2e04c5..e7ab02cd98a5 100644
--- a/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -1001,8 +1001,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmin_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.minnum.v2f32
+; CHECK: <2 x float> @llvm.minnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmin_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
@@ -1021,8 +1023,10 @@ for.body:                                         ; preds = %entry, %for.body
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+; This can be vectorized with additional runtime checks for NaNs.
 ; CHECK-LABEL: @fmax_intrinsic_nofast(
-; CHECK-NOT: <2 x float> @llvm.maxnum.v2f32
+; CHECK: <2 x float> @llvm.maxnum.v2f32
+; CHECK: fcmp uno <2 x float> [[OP:.+]], [[OP]]
 define float @fmax_intrinsic_nofast(ptr nocapture readonly %x) {
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
index a47037c46eed..7a5cf4576ac9 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-small-clamped-bounds.ll
@@ -19,6 +19,7 @@ define void @load_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP0]] to i2
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
@@ -91,6 +92,7 @@ define void @store_clamped_index(ptr %A, ptr %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP0]] to i2
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
 ; CHECK:       vector.memcheck:
@@ -359,6 +361,7 @@ define void @clamped_index_equal_dependence(ptr %A, ptr %B, i32 %N) {
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; CHECK:       vector.scevcheck:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP0]] to i2
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
 ; CHECK-NEXT:    br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
index 6e5eeaf9f121..3e2569530485 100644
--- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
+++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-versioning.ll
@@ -29,7 +29,12 @@ define void @f1(ptr noalias %a,
 ; LV-LABEL: @f1(
 ; LV-NEXT:  for.body.lver.check:
 ; LV-NEXT:    [[TMP0:%.*]] = add i64 [[N:%.*]], -1
+; LV-NEXT:    [[TMP5:%.*]] = trunc i64 [[TMP0]] to i32
+; LV-NEXT:    [[MUL2:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 2, i32 [[TMP5]])
+; LV-NEXT:    [[MUL_RESULT1:%.*]] = extractvalue { i32, i1 } [[MUL2]], 0
+; LV-NEXT:    [[MUL_OVERFLOW1:%.*]] = extractvalue { i32, i1 } [[MUL2]], 1
 ; LV-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], 4294967295
+; LV-NEXT:    [[TMP8:%.*]] = or i1 [[MUL_OVERFLOW1]], [[TMP1]]
 ; LV-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP0]])
 ; LV-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
 ; LV-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
@@ -37,7 +42,7 @@ define void @f1(ptr noalias %a,
 ; LV-NEXT:    [[TMP3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[MUL_RESULT]]
 ; LV-NEXT:    [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[A]]
 ; LV-NEXT:    [[TMP6:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; LV-NEXT:    [[TMP7:%.*]] = or i1 [[TMP1]], [[TMP6]]
+; LV-NEXT:    [[TMP7:%.*]] = or i1 [[TMP8]], [[TMP6]]
 ; LV-NEXT:    br i1 [[TMP7]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH:%.*]]
 ; LV:       for.body.ph.lver.orig:
 ; LV-NEXT:    br label [[FOR_BODY_LVER_ORIG:%.*]]
@@ -75,7 +80,7 @@ define void @f1(ptr noalias %a,
 ; LV-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT2:%.*]], label [[FOR_BODY]]
 ; LV:       for.end.loopexit:
 ; LV-NEXT:    br label [[FOR_END:%.*]]
-; LV:       for.end.loopexit2:
+; LV:       for.end.loopexit5:
 ; LV-NEXT:    br label [[FOR_END]]
 ; LV:       for.end:
 ; LV-NEXT:    ret void
diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll
new file mode 100644
index 000000000000..3a1d1324d8e4
--- /dev/null
+++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-log.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s
+target triple = "aarch64"
+
+define <vscale x 4 x float> @scalable_vec_log(<vscale x 4 x float> %input) {
+; CHECK-LABEL: define <vscale x 4 x float> @scalable_vec_log(
+; CHECK-SAME: <vscale x 4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
+; CHECK-NEXT:    br label %[[BB3:.*]]
+; CHECK:       [[BB3]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi <vscale x 4 x float> [ [[INPUT]], [[TMP0]] ], [ [[TMP8:%.*]], %[[BB3]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 4 x float> [[TMP5]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call float @llvm.log.f32(float [[TMP6]])
+; CHECK-NEXT:    [[TMP8]] = insertelement <vscale x 4 x float> [[TMP5]], float [[TMP7]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9]] = add i64 [[TMP4]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[TMP9]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB3]]
+; CHECK:       [[BB11]]:
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP8]]
+;
+  %output = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> %input)
+  ret <vscale x 4 x float> %output
+}
+
+define <4 x float> @fixed_vec_log(<4 x float> %input) {
+; CHECK-LABEL: define <4 x float> @fixed_vec_log(
+; CHECK-SAME: <4 x float> [[INPUT:%.*]]) {
+; CHECK-NEXT:    [[OUTPUT:%.*]] = call <4 x float> @llvm.log.v4f32(<4 x float> [[INPUT]])
+; CHECK-NEXT:    ret <4 x float> [[OUTPUT]]
+;
+  %output = call <4 x float> @llvm.log.v4f32(<4 x float> %input)
+  ret <4 x float> %output
+}
+
+declare <4 x float> @llvm.log.v4f32(<4 x float>) #0
+declare <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 5e3fd156666f..410696260a85 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -16,12 +16,11 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, ptr %p) {
 ; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[S0]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, ptr [[GEP0]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[SUB0]], <4 x i32> poison, <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, ptr [[GEP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, ptr [[GEP2]], align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
index 2b5ee59aeb16..96dd691c4816 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
@@ -5,24 +5,21 @@ define i16 @foo(i16 %in1, i16 %in2) {
 ; CHECK-LABEL: define i16 @foo(
 ; CHECK-SAME: i16 [[IN1:%.*]], i16 [[IN2:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535)
-; CHECK-NEXT:    [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533)
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT:    [[ZEXT1_1:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_1:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP10:%.*]] = mul nuw nsw i64 [[ZEXT2_1]], [[ZEXT1_1]]
+; CHECK-NEXT:    [[AND1:%.*]] = and i64 [[TMP10]], 65535
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp ne i64 [[AND1]], 65533
 ; CHECK-NEXT:    [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
 ; CHECK-NEXT:    [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
 ; CHECK-NEXT:    [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
 ; CHECK-NEXT:    [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT:    [[ZEXT1_2:%.*]] = zext i16 [[IN1]] to i64
+; CHECK-NEXT:    [[ZEXT2_2:%.*]] = zext i16 [[IN2]] to i64
+; CHECK-NEXT:    [[TMP13:%.*]] = mul nuw nsw i64 [[ZEXT2_2]], [[ZEXT1_2]]
+; CHECK-NEXT:    [[AND2:%.*]] = and i64 [[TMP13]], 65535
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp ne i64 [[AND2]], 65533
 ; CHECK-NEXT:    [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
 ; CHECK-NEXT:    [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605
 ; CHECK-NEXT:    [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
 ; CHECK-NEXT:    [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
similarity index 89%
rename from llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
index a1f4590a5691..04c69106d97f 100644
--- a/llvm/test/Transforms/SLPVectorizer/extract-subvector-long-input.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extract-subvector-long-input.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define void @test() {
 ; CHECK-LABEL: define void @test() {
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index 667fc41c069e..10a17f7e3f9a 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -645,20 +645,21 @@ define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo)
 ; CHECK-NEXT:    [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
 ; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
 ; CHECK-NEXT:    [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <16 x i8> [[A1]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x i8> [[A1]], i64 8
 ; CHECK-NEXT:    [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
 ; CHECK-NEXT:    [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
 ; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[E3:%.*]] = extractelement <16 x i8> [[A2]], i64 0
+; CHECK-NEXT:    [[E4:%.*]] = extractelement <16 x i8> [[A2]], i64 8
 ; CHECK-NEXT:    [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
 ; CHECK-NEXT:    [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
 ; CHECK-NEXT:    store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
-; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
 ; CHECK-NEXT:    [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O3]], 0
+; CHECK-NEXT:    [[O2:%.*]] = or i8 [[E4]], [[E3]]
+; CHECK-NEXT:    [[O4:%.*]] = or i8 [[O3]], [[O2]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O4]], 0
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %l = load <2 x i64>, ptr %values, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
index 8d7d31ab9844..f397290299a4 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-buildvector-with-minbitwidth-user.ll
@@ -7,54 +7,72 @@ define void @h(i16 %a, i16 %b, i16 %c, i16 %d, i16 %e, i16 %f, i16 %g, i16 %h, i
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONV9:%.*]] = zext i16 [[A]] to i32
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr i8, ptr null, i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[E]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[I]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[M]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = zext i16 [[TMP4]] to i32
-; CHECK-NEXT:    [[SUB:%.*]] = or i32 [[CONV9]], [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i16> poison, i16 [[G]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[K]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[O]], i32 2
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i16> [[TMP8]], i16 [[C]], i32 3
+; CHECK-NEXT:    [[CONV310:%.*]] = zext i16 [[B]] to i32
+; CHECK-NEXT:    [[ADD4:%.*]] = or i32 [[CONV310]], [[CONV9]]
+; CHECK-NEXT:    [[SUB:%.*]] = or i32 [[CONV9]], [[CONV310]]
+; CHECK-NEXT:    [[CONV15:%.*]] = sext i16 [[C]] to i32
 ; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 0, 0
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i8, ptr null, i64 24
 ; CHECK-NEXT:    [[CONV19:%.*]] = sext i16 [[D]] to i32
 ; CHECK-NEXT:    [[SUB20:%.*]] = or i32 [[SHR]], [[CONV19]]
+; CHECK-NEXT:    [[SHR29:%.*]] = ashr i32 0, 0
+; CHECK-NEXT:    [[ADD30:%.*]] = or i32 [[SHR29]], [[CONV15]]
 ; CHECK-NEXT:    [[SUB39:%.*]] = or i32 [[SUB]], [[SUB20]]
 ; CHECK-NEXT:    [[CONV40:%.*]] = trunc i32 [[SUB39]] to i16
 ; CHECK-NEXT:    store i16 [[CONV40]], ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[SUB44:%.*]] = or i32 [[ADD4]], [[ADD30]]
+; CHECK-NEXT:    [[CONV45:%.*]] = trunc i32 [[SUB44]] to i16
+; CHECK-NEXT:    store i16 [[CONV45]], ptr [[ARRAYIDX18]], align 2
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr i8, ptr null, i64 18
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <4 x i16> [[TMP3]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i16 [[TMP10]] to i32
-; CHECK-NEXT:    [[ADD4_1:%.*]] = or i32 [[TMP11]], 0
+; CHECK-NEXT:    [[CONV3_112:%.*]] = zext i16 [[E]] to i32
+; CHECK-NEXT:    [[ADD4_1:%.*]] = or i32 [[CONV3_112]], 0
+; CHECK-NEXT:    [[SUB_1:%.*]] = or i32 0, [[CONV3_112]]
 ; CHECK-NEXT:    [[CONV15_1:%.*]] = sext i16 [[F]] to i32
+; CHECK-NEXT:    [[SHR_1:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_1:%.*]] = getelementptr i8, ptr null, i64 26
+; CHECK-NEXT:    [[CONV19_1:%.*]] = sext i16 [[G]] to i32
+; CHECK-NEXT:    [[SUB20_1:%.*]] = or i32 [[SHR_1]], [[CONV19_1]]
 ; CHECK-NEXT:    [[SHR29_1:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_1:%.*]] = or i32 [[SHR29_1]], [[CONV15_1]]
+; CHECK-NEXT:    [[SUB39_1:%.*]] = or i32 [[SUB_1]], [[SUB20_1]]
+; CHECK-NEXT:    [[CONV40_1:%.*]] = trunc i32 [[SUB39_1]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_1]], ptr [[ARRAYIDX2_1]], align 2
 ; CHECK-NEXT:    [[SUB44_1:%.*]] = or i32 [[ADD4_1]], [[ADD30_1]]
 ; CHECK-NEXT:    [[CONV45_1:%.*]] = trunc i32 [[SUB44_1]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_1]], ptr [[ARRAYIDX18_1]], align 2
 ; CHECK-NEXT:    [[CONV_213:%.*]] = zext i16 [[H]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr i8, ptr null, i64 20
+; CHECK-NEXT:    [[CONV3_214:%.*]] = zext i16 [[I]] to i32
 ; CHECK-NEXT:    [[ADD4_2:%.*]] = or i32 0, [[CONV_213]]
+; CHECK-NEXT:    [[SUB_2:%.*]] = or i32 0, [[CONV3_214]]
 ; CHECK-NEXT:    [[CONV15_2:%.*]] = sext i16 [[J]] to i32
+; CHECK-NEXT:    [[SHR_2:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_2:%.*]] = getelementptr i8, ptr null, i64 28
+; CHECK-NEXT:    [[CONV19_2:%.*]] = sext i16 [[K]] to i32
+; CHECK-NEXT:    [[SUB20_2:%.*]] = or i32 [[SHR_2]], [[CONV19_2]]
 ; CHECK-NEXT:    [[SHR29_2:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_2:%.*]] = or i32 [[SHR29_2]], [[CONV15_2]]
+; CHECK-NEXT:    [[SUB39_2:%.*]] = or i32 [[SUB_2]], [[SUB20_2]]
+; CHECK-NEXT:    [[CONV40_2:%.*]] = trunc i32 [[SUB39_2]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_2]], ptr [[ARRAYIDX2_2]], align 2
 ; CHECK-NEXT:    [[SUB44_2:%.*]] = or i32 [[ADD4_2]], [[ADD30_2]]
 ; CHECK-NEXT:    [[CONV45_2:%.*]] = trunc i32 [[SUB44_2]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_2]], ptr [[ARRAYIDX18_2]], align 2
 ; CHECK-NEXT:    [[CONV_315:%.*]] = zext i16 [[L]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2_3:%.*]] = getelementptr i8, ptr null, i64 22
+; CHECK-NEXT:    [[CONV3_316:%.*]] = zext i16 [[M]] to i32
 ; CHECK-NEXT:    [[ADD4_3:%.*]] = or i32 0, [[CONV_315]]
+; CHECK-NEXT:    [[SUB_3:%.*]] = or i32 0, [[CONV3_316]]
 ; CHECK-NEXT:    [[CONV15_3:%.*]] = sext i16 [[N]] to i32
+; CHECK-NEXT:    [[SHR_3:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ARRAYIDX18_3:%.*]] = getelementptr i8, ptr null, i64 30
+; CHECK-NEXT:    [[CONV19_3:%.*]] = sext i16 [[O]] to i32
+; CHECK-NEXT:    [[SUB20_3:%.*]] = or i32 [[SHR_3]], [[CONV19_3]]
 ; CHECK-NEXT:    [[SHR29_3:%.*]] = ashr i32 0, 0
 ; CHECK-NEXT:    [[ADD30_3:%.*]] = or i32 [[SHR29_3]], [[CONV15_3]]
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i16> <i16 0, i16 0, i16 0, i16 poison>, i16 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = or <4 x i16> [[TMP3]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = or <4 x i16> zeroinitializer, [[TMP9]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or <4 x i16> [[TMP13]], [[TMP14]]
-; CHECK-NEXT:    store <4 x i16> [[TMP15]], ptr [[ARRAYIDX2_1]], align 2
+; CHECK-NEXT:    [[SUB39_3:%.*]] = or i32 [[SUB_3]], [[SUB20_3]]
+; CHECK-NEXT:    [[CONV40_3:%.*]] = trunc i32 [[SUB39_3]] to i16
+; CHECK-NEXT:    store i16 [[CONV40_3]], ptr [[ARRAYIDX2_3]], align 2
 ; CHECK-NEXT:    [[SUB44_3:%.*]] = or i32 [[ADD4_3]], [[ADD30_3]]
 ; CHECK-NEXT:    [[CONV45_3:%.*]] = trunc i32 [[SUB44_3]] to i16
 ; CHECK-NEXT:    store i16 [[CONV45_3]], ptr [[ARRAYIDX18_3]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index fcbe2d631ba8..3376fb8910b7 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer -slp-threshold=-12 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
@@ -26,7 +26,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '4'
+; YAML-NEXT:   - Cost:            '6'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -36,7 +36,7 @@
 ; YAML-NEXT: Function:        getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT:   - String:          'SLP vectorized with cost '
-; YAML-NEXT:   - Cost:            '6'
+; YAML-NEXT:   - Cost:            '8'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -51,35 +51,39 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD16:%.*]] = extractelement <2 x i32> [[TMP17:%.*]], i32 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD16]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[ADD16]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP15]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP0]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP12]]
 ; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP7]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP16]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
 ; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP18:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP13]]
 ; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
 ; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x i32> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i32 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0
+; CHECK-NEXT:    [[TMP17]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
@@ -129,7 +133,7 @@ for.body:
 ; YAML:      Function:        getelementptr_2x32
 ; YAML:     Args:
 ; YAML:        - String:          'SLP vectorized with cost '
-; YAML:        - Cost:            '4'
+; YAML:        - Cost:            '8'
 ; YAML-NEXT:   - String:          ' and with tree size '
 ; YAML-NEXT:   - TreeSize:        '3'
 
@@ -139,36 +143,42 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT:    [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[Z:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[OP_RDX:%.*]] = extractelement <2 x i32> [[TMP13:%.*]], i32 0
 ; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
 ; CHECK:       for.cond.cleanup:
-; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
+; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[SUM_032:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[OP_RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i32> [[TMP6]], i32 1
+; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP12]], 1
+; CHECK-NEXT:    [[T5:%.*]] = add nsw i32 [[T4]], 0
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[T5]]
+; CHECK-NEXT:    [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP5]]
+; CHECK-NEXT:    [[T7:%.*]] = add nsw i32 [[T4]], 1
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T7]]
+; CHECK-NEXT:    [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP0]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[G:%.*]], i32 [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x i32>, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
-; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT:    [[T11:%.*]] = add nsw i32 [[T4]], [[Z:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[T11]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0
+; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP9]]
+; CHECK-NEXT:    [[T10:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4
+; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1
+; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i32 [[TMP10]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i32 2
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T12]], i32 3
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP10]], <4 x i32> <i32 4, i32 5, i32 2, i32 3>
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]]
-; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ADD11]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[T12]], i32 0
+; CHECK-NEXT:    [[TMP13]] = add nsw <2 x i32> [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
index 1826e19d8e73..4847149c87a1 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr2.ll
@@ -14,7 +14,7 @@
 ; YAML-NEXT:  Function:        test_i16_extend
 ; YAML-NEXT:  Args:
 ; YAML-NEXT:    - String:          'SLP vectorized with cost '
-; YAML-NEXT:    - Cost:            '-20'
+; YAML-NEXT:    - Cost:            '-16
 ; YAML-NEXT:    - String:          ' and with tree size '
 ; YAML-NEXT:    - TreeSize:        '5'
 ; YAML-NEXT:  ...
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 9f5744b17cb7..929fb29a4a67 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -600,15 +600,27 @@ bb15:                                             ; preds = %bb15, %bb14
 define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
 ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul float 1.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul float 3.000000e+01, 2.000000e+01
+; CHECK-NEXT:    [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
+; CHECK-NEXT:    [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
+; CHECK-NEXT:    [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
+; CHECK-NEXT:    store i32 [[TMP7]], ptr [[A:%.*]], align 8
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
+; CHECK-NEXT:    store i32 [[TMP10]], ptr [[TMP12]], align 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
 ; CHECK:       bb14:
-; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 10 to i64
+; CHECK-NEXT:    [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
 ; CHECK-NEXT:    store float 0.000000e+00, ptr [[TMP20]], align 8
 ; CHECK-NEXT:    [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index 9562e6d41f7c..fe3db7d462e8 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,168 +14,389 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
+; CHECK-NEXT:    [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
+; CHECK-NEXT:    [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
+; CHECK-NEXT:    [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
+; CHECK-NEXT:    [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
+; CHECK-NEXT:    [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
+; CHECK-NEXT:    [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
+; CHECK-NEXT:    [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
+; CHECK-NEXT:    [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
+; CHECK-NEXT:    [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
+; CHECK-NEXT:    [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
+; CHECK-NEXT:    [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
+; CHECK-NEXT:    [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
+; CHECK-NEXT:    [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
+; CHECK-NEXT:    [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
+; CHECK-NEXT:    [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
+; CHECK-NEXT:    [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
+; CHECK-NEXT:    [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
+; CHECK-NEXT:    [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
+; CHECK-NEXT:    [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
+; CHECK-NEXT:    [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
+; CHECK-NEXT:    [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
+; CHECK-NEXT:    [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
+; CHECK-NEXT:    [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
 ; CHECK-NEXT:    [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
+; CHECK-NEXT:    [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
+; CHECK-NEXT:    [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
+; CHECK-NEXT:    [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
+; CHECK-NEXT:    [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT:    [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
+; CHECK-NEXT:    [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
+; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
+; CHECK-NEXT:    [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
+; CHECK-NEXT:    [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
+; CHECK-NEXT:    [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT:    [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
+; CHECK-NEXT:    [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
+; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
+; CHECK-NEXT:    [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
+; CHECK-NEXT:    [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT:    [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
+; CHECK-NEXT:    [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
+; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
+; CHECK-NEXT:    [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT:    [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
+; CHECK-NEXT:    [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
+; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
+; CHECK-NEXT:    [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
+; CHECK-NEXT:    [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT:    [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
+; CHECK-NEXT:    [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
+; CHECK-NEXT:    [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
+; CHECK-NEXT:    [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT:    [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
+; CHECK-NEXT:    [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
+; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
+; CHECK-NEXT:    [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
+; CHECK-NEXT:    [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT:    [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
+; CHECK-NEXT:    [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
+; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
+; CHECK-NEXT:    [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
 ; CHECK-NEXT:    [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
+; CHECK-NEXT:    [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
+; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
+; CHECK-NEXT:    [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
+; CHECK-NEXT:    [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
+; CHECK-NEXT:    [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT:    [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
+; CHECK-NEXT:    [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
+; CHECK-NEXT:    [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
+; CHECK-NEXT:    [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
+; CHECK-NEXT:    [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT:    [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
+; CHECK-NEXT:    [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
+; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
+; CHECK-NEXT:    [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
+; CHECK-NEXT:    [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT:    [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
+; CHECK-NEXT:    [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
+; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
+; CHECK-NEXT:    [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
+; CHECK-NEXT:    [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT:    [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
+; CHECK-NEXT:    [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
+; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
+; CHECK-NEXT:    [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT:    [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
+; CHECK-NEXT:    [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
+; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
+; CHECK-NEXT:    [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
+; CHECK-NEXT:    [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT:    [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
+; CHECK-NEXT:    [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
+; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
+; CHECK-NEXT:    [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
+; CHECK-NEXT:    [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT:    [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
+; CHECK-NEXT:    [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
+; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
+; CHECK-NEXT:    [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
 ; CHECK-NEXT:    [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
+; CHECK-NEXT:    [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
+; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
+; CHECK-NEXT:    [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
+; CHECK-NEXT:    [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
+; CHECK-NEXT:    [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
+; CHECK-NEXT:    [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
+; CHECK-NEXT:    [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
+; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
+; CHECK-NEXT:    [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
+; CHECK-NEXT:    [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
+; CHECK-NEXT:    [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
+; CHECK-NEXT:    [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
+; CHECK-NEXT:    [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
+; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
+; CHECK-NEXT:    [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
+; CHECK-NEXT:    [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
+; CHECK-NEXT:    [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
+; CHECK-NEXT:    [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
+; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
+; CHECK-NEXT:    [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
+; CHECK-NEXT:    [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
+; CHECK-NEXT:    [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
+; CHECK-NEXT:    [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
+; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
+; CHECK-NEXT:    [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
+; CHECK-NEXT:    [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
+; CHECK-NEXT:    [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
+; CHECK-NEXT:    [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
+; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
+; CHECK-NEXT:    [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
+; CHECK-NEXT:    [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
+; CHECK-NEXT:    [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
+; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
+; CHECK-NEXT:    [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
+; CHECK-NEXT:    [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
+; CHECK-NEXT:    [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
+; CHECK-NEXT:    [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
+; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
+; CHECK-NEXT:    [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
 ; CHECK-NEXT:    [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
+; CHECK-NEXT:    [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
+; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
+; CHECK-NEXT:    [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
+; CHECK-NEXT:    [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
+; CHECK-NEXT:    [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
+; CHECK-NEXT:    [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
+; CHECK-NEXT:    [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
+; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
+; CHECK-NEXT:    [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
+; CHECK-NEXT:    [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
+; CHECK-NEXT:    [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
+; CHECK-NEXT:    [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
+; CHECK-NEXT:    [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
+; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
+; CHECK-NEXT:    [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
+; CHECK-NEXT:    [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
+; CHECK-NEXT:    [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
+; CHECK-NEXT:    [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
+; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
+; CHECK-NEXT:    [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
+; CHECK-NEXT:    [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
+; CHECK-NEXT:    [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
+; CHECK-NEXT:    [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
+; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
+; CHECK-NEXT:    [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
+; CHECK-NEXT:    [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
+; CHECK-NEXT:    [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
+; CHECK-NEXT:    [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
+; CHECK-NEXT:    [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
+; CHECK-NEXT:    [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
+; CHECK-NEXT:    [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
+; CHECK-NEXT:    [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
+; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
+; CHECK-NEXT:    [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
+; CHECK-NEXT:    [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
+; CHECK-NEXT:    [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
+; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
+; CHECK-NEXT:    [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
 ; CHECK-NEXT:    [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
+; CHECK-NEXT:    [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
+; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
+; CHECK-NEXT:    [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
+; CHECK-NEXT:    [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
+; CHECK-NEXT:    [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
+; CHECK-NEXT:    [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
+; CHECK-NEXT:    [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
+; CHECK-NEXT:    [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
+; CHECK-NEXT:    [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
+; CHECK-NEXT:    [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
+; CHECK-NEXT:    [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
+; CHECK-NEXT:    [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
+; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
+; CHECK-NEXT:    [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
+; CHECK-NEXT:    [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
+; CHECK-NEXT:    [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
+; CHECK-NEXT:    [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
+; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
+; CHECK-NEXT:    [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
+; CHECK-NEXT:    [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
+; CHECK-NEXT:    [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
+; CHECK-NEXT:    [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
+; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
+; CHECK-NEXT:    [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
+; CHECK-NEXT:    [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
+; CHECK-NEXT:    [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
+; CHECK-NEXT:    [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
+; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
+; CHECK-NEXT:    [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
+; CHECK-NEXT:    [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
+; CHECK-NEXT:    [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
+; CHECK-NEXT:    [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
+; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
+; CHECK-NEXT:    [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
+; CHECK-NEXT:    [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
+; CHECK-NEXT:    [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
+; CHECK-NEXT:    [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
+; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
+; CHECK-NEXT:    [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
 ; CHECK-NEXT:    [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT:    [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT:    [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> [[TMP1]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <64 x i16> [[TMP10]], <64 x i16> [[TMP11]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = shufflevector <64 x i16> [[TMP12]], <64 x i16> [[TMP13]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP83:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP84:%.*]] = shufflevector <64 x i16> [[TMP14]], <64 x i16> [[TMP83]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP85:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP86:%.*]] = shufflevector <64 x i16> [[TMP84]], <64 x i16> [[TMP85]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP87:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP88:%.*]] = shufflevector <64 x i16> [[TMP86]], <64 x i16> [[TMP87]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP89:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP88]], <64 x i16> [[TMP89]], <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71>
-; CHECK-NEXT:    [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32>
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0
-; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT:    [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2
-; CHECK-NEXT:    [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3
-; CHECK-NEXT:    [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5
-; CHECK-NEXT:    [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6
-; CHECK-NEXT:    [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]]
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7
-; CHECK-NEXT:    [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]]
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8
-; CHECK-NEXT:    [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]]
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9
-; CHECK-NEXT:    [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]]
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10
-; CHECK-NEXT:    [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]]
-; CHECK-NEXT:    [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11
-; CHECK-NEXT:    [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]]
-; CHECK-NEXT:    [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12
-; CHECK-NEXT:    [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]]
-; CHECK-NEXT:    [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13
-; CHECK-NEXT:    [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]]
-; CHECK-NEXT:    [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14
-; CHECK-NEXT:    [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]]
-; CHECK-NEXT:    [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15
-; CHECK-NEXT:    [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]]
-; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16
-; CHECK-NEXT:    [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]]
-; CHECK-NEXT:    [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17
-; CHECK-NEXT:    [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]]
-; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18
-; CHECK-NEXT:    [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]]
-; CHECK-NEXT:    [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19
-; CHECK-NEXT:    [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]]
-; CHECK-NEXT:    [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20
-; CHECK-NEXT:    [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]]
-; CHECK-NEXT:    [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21
-; CHECK-NEXT:    [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]]
-; CHECK-NEXT:    [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22
-; CHECK-NEXT:    [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]]
-; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23
-; CHECK-NEXT:    [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]]
-; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24
-; CHECK-NEXT:    [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]]
-; CHECK-NEXT:    [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25
-; CHECK-NEXT:    [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]]
-; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26
-; CHECK-NEXT:    [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]]
-; CHECK-NEXT:    [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27
-; CHECK-NEXT:    [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]]
-; CHECK-NEXT:    [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28
-; CHECK-NEXT:    [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]]
-; CHECK-NEXT:    [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29
-; CHECK-NEXT:    [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]]
-; CHECK-NEXT:    [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30
-; CHECK-NEXT:    [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]]
-; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31
-; CHECK-NEXT:    [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]]
-; CHECK-NEXT:    [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32
-; CHECK-NEXT:    [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]]
-; CHECK-NEXT:    [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33
-; CHECK-NEXT:    [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]]
-; CHECK-NEXT:    [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34
-; CHECK-NEXT:    [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]]
-; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35
-; CHECK-NEXT:    [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]]
-; CHECK-NEXT:    [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36
-; CHECK-NEXT:    [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]]
-; CHECK-NEXT:    [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37
-; CHECK-NEXT:    [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]]
-; CHECK-NEXT:    [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38
-; CHECK-NEXT:    [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]]
-; CHECK-NEXT:    [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39
-; CHECK-NEXT:    [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]]
-; CHECK-NEXT:    [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40
-; CHECK-NEXT:    [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]]
-; CHECK-NEXT:    [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41
-; CHECK-NEXT:    [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]]
-; CHECK-NEXT:    [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42
-; CHECK-NEXT:    [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43
-; CHECK-NEXT:    [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]]
-; CHECK-NEXT:    [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44
-; CHECK-NEXT:    [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]]
-; CHECK-NEXT:    [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45
-; CHECK-NEXT:    [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]]
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46
-; CHECK-NEXT:    [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]]
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47
-; CHECK-NEXT:    [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]]
-; CHECK-NEXT:    [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48
-; CHECK-NEXT:    [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]]
-; CHECK-NEXT:    [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49
-; CHECK-NEXT:    [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]]
-; CHECK-NEXT:    [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50
-; CHECK-NEXT:    [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]]
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51
-; CHECK-NEXT:    [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]]
-; CHECK-NEXT:    [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52
-; CHECK-NEXT:    [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]]
-; CHECK-NEXT:    [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53
-; CHECK-NEXT:    [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]]
-; CHECK-NEXT:    [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54
-; CHECK-NEXT:    [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]]
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55
-; CHECK-NEXT:    [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]]
-; CHECK-NEXT:    [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56
-; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]]
-; CHECK-NEXT:    [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57
-; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]]
-; CHECK-NEXT:    [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58
-; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]]
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59
-; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]]
-; CHECK-NEXT:    [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60
-; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]]
-; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61
-; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]]
-; CHECK-NEXT:    [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62
-; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]]
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63
-; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]]
-; CHECK-NEXT:    [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]])
+; CHECK-NEXT:    [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT:    [[CONV_764:%.*]] = zext i16 [[TMP56]] to i32
+; CHECK-NEXT:    [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]]
+; CHECK-NEXT:    [[MUL_766:%.*]] = mul nuw nsw i32 [[CONV_764]], [[CONV_764]]
+; CHECK-NEXT:    [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
+; CHECK-NEXT:    [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
+; CHECK-NEXT:    [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
+; CHECK-NEXT:    [[CONV_1_7:%.*]] = zext i16 [[TMP57]] to i32
+; CHECK-NEXT:    [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]]
+; CHECK-NEXT:    [[MUL_1_7:%.*]] = mul nuw nsw i32 [[CONV_1_7]], [[CONV_1_7]]
+; CHECK-NEXT:    [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
+; CHECK-NEXT:    [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
+; CHECK-NEXT:    [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
+; CHECK-NEXT:    [[CONV_2_7:%.*]] = zext i16 [[TMP58]] to i32
+; CHECK-NEXT:    [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]]
+; CHECK-NEXT:    [[MUL_2_7:%.*]] = mul nuw nsw i32 [[CONV_2_7]], [[CONV_2_7]]
+; CHECK-NEXT:    [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
+; CHECK-NEXT:    [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
+; CHECK-NEXT:    [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
+; CHECK-NEXT:    [[CONV_3_7:%.*]] = zext i16 [[TMP59]] to i32
+; CHECK-NEXT:    [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]]
+; CHECK-NEXT:    [[MUL_3_7:%.*]] = mul nuw nsw i32 [[CONV_3_7]], [[CONV_3_7]]
+; CHECK-NEXT:    [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
+; CHECK-NEXT:    [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
+; CHECK-NEXT:    [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
+; CHECK-NEXT:    [[CONV_4_7:%.*]] = zext i16 [[TMP60]] to i32
+; CHECK-NEXT:    [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]]
+; CHECK-NEXT:    [[MUL_4_7:%.*]] = mul nuw nsw i32 [[CONV_4_7]], [[CONV_4_7]]
+; CHECK-NEXT:    [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
+; CHECK-NEXT:    [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
+; CHECK-NEXT:    [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
+; CHECK-NEXT:    [[CONV_5_7:%.*]] = zext i16 [[TMP61]] to i32
+; CHECK-NEXT:    [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]]
+; CHECK-NEXT:    [[MUL_5_7:%.*]] = mul nuw nsw i32 [[CONV_5_7]], [[CONV_5_7]]
+; CHECK-NEXT:    [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
+; CHECK-NEXT:    [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
+; CHECK-NEXT:    [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
+; CHECK-NEXT:    [[CONV_6_7:%.*]] = zext i16 [[TMP62]] to i32
+; CHECK-NEXT:    [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]]
+; CHECK-NEXT:    [[MUL_6_7:%.*]] = mul nuw nsw i32 [[CONV_6_7]], [[CONV_6_7]]
+; CHECK-NEXT:    [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
+; CHECK-NEXT:    [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
+; CHECK-NEXT:    [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
+; CHECK-NEXT:    [[CONV_7_7:%.*]] = zext i16 [[TMP63]] to i32
+; CHECK-NEXT:    [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]]
+; CHECK-NEXT:    [[MUL_7_7:%.*]] = mul nuw nsw i32 [[CONV_7_7]], [[CONV_7_7]]
+; CHECK-NEXT:    [[ADD11_7_7:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
 ; CHECK-NEXT:    [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
-; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[TMP82]] to i64
+; CHECK-NEXT:    [[CONV16:%.*]] = zext i32 [[ADD11_7_7]] to i64
 ; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
 ; CHECK-NEXT:    [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]]
 ; CHECK-NEXT:    ret i64 [[ADD17]]
@@ -580,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT:    [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
-; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT:    [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT:    [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT:    [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT:    [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT:    [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT:    [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT:    [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
+; CHECK-NEXT:    [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT:    [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT:    [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
+; CHECK-NEXT:    [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT:    [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT:    [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT:    [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT:    [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
+; CHECK-NEXT:    [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT:    [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT:    [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT:    [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT:    [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
+; CHECK-NEXT:    [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT:    [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT:    [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT:    [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
+; CHECK-NEXT:    [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
+; CHECK-NEXT:    [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
+; CHECK-NEXT:    [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
+; CHECK-NEXT:    [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
+; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
+; CHECK-NEXT:    [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
+; CHECK-NEXT:    [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT:    [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
+; CHECK-NEXT:    [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
+; CHECK-NEXT:    [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
+; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
+; CHECK-NEXT:    [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
+; CHECK-NEXT:    [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT:    [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
+; CHECK-NEXT:    [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
+; CHECK-NEXT:    [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
+; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
+; CHECK-NEXT:    [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
+; CHECK-NEXT:    [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT:    [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
+; CHECK-NEXT:    [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
+; CHECK-NEXT:    [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
+; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
+; CHECK-NEXT:    [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
+; CHECK-NEXT:    [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
+; CHECK-NEXT:    [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
+; CHECK-NEXT:    [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
+; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
+; CHECK-NEXT:    [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
+; CHECK-NEXT:    [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT:    [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
+; CHECK-NEXT:    [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
+; CHECK-NEXT:    [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
+; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
+; CHECK-NEXT:    [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
+; CHECK-NEXT:    [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
+; CHECK-NEXT:    [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
+; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
+; CHECK-NEXT:    [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]]
+; CHECK-NEXT:    [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
+; CHECK-NEXT:    [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]]
 ; CHECK-NEXT:    [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT:    [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
similarity index 94%
rename from llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
rename to llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
index 4478eab7b827..15f4cffe7791 100644
--- a/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/phi-node-bitwidt-op-not.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
index afaf6b98e508..094d60b66b39 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/splat-loads.ll
@@ -90,14 +90,14 @@ entry:
 define void @splat_loads_i64(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i64(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i64, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i64, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i64, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i64, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i64, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i64, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i64>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i64> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i64> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[TMP3]], [[TMP6]]
@@ -131,14 +131,14 @@ entry:
 define void @splat_loads_i32(ptr %array1, ptr %array2, ptr %ptrA, ptr %ptrB) {
 ; CHECK-LABEL: @splat_loads_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[GEP_2_1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY2:%.*]], i64 1
-; CHECK-NEXT:    [[LD_2_0:%.*]] = load i32, ptr [[ARRAY2]], align 8
-; CHECK-NEXT:    [[LD_2_1:%.*]] = load i32, ptr [[GEP_2_1]], align 8
+; CHECK-NEXT:    [[GEP_2_2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY3:%.*]], i64 1
+; CHECK-NEXT:    [[LD_2_2:%.*]] = load i32, ptr [[ARRAY3]], align 8
+; CHECK-NEXT:    [[LD_2_3:%.*]] = load i32, ptr [[GEP_2_2]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAY1:%.*]], align 8
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_2]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i32> [[TMP0]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LD_2_3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[TMP6:%.*]] = or <2 x i32> [[TMP0]], [[TMP5]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add <2 x i32> [[TMP3]], [[TMP6]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
index aeb82d800a2f..3c2f9e4d0ab5 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
@@ -4,17 +4,17 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1
+; CHECK-NEXT:    [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1
 ; CHECK-NEXT:    br label %[[IF_END:.*]]
 ; CHECK:       [[IF_THEN:.*]]:
 ; CHECK-NEXT:    br label %[[IF_END]]
 ; CHECK:       [[IF_END]]:
-; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
+; CHECK-NEXT:    [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
 ; CHECK-NEXT:    store i32 [[TMP2]], ptr null, align 1
 ; CHECK-NEXT:    br label %[[TRAP:.*]]
 ; CHECK:       [[BB3:.*:]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP4]], ptr null, align 1
 ; CHECK-NEXT:    ret void
 ; CHECK:       [[TRAP]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
index 3cb81b72d26a..14ce08cb7aeb 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
@@ -6,19 +6,36 @@ define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
 ; CHECK-LABEL: define void @should_vectorize_gep
 ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
 ; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
-; CHECK-NEXT:    [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
+; CHECK-NEXT:    [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
+; CHECK-NEXT:    [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
+; CHECK-NEXT:    [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
+; CHECK-NEXT:    [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
+; CHECK-NEXT:    [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
+; CHECK-NEXT:    [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
+; CHECK-NEXT:    [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
+; CHECK-NEXT:    [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
+; CHECK-NEXT:    [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
+; CHECK-NEXT:    [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
+; CHECK-NEXT:    [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
+; CHECK-NEXT:    [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
+; CHECK-NEXT:    [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
+; CHECK-NEXT:    [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
+; CHECK-NEXT:    [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
+; CHECK-NEXT:    [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
+; CHECK-NEXT:    [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
+; CHECK-NEXT:    [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
+; CHECK-NEXT:    [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
+; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
+; CHECK-NEXT:    [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
+; CHECK-NEXT:    [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
+; CHECK-NEXT:    [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
+; CHECK-NEXT:    [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
+; CHECK-NEXT:    [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
+; CHECK-NEXT:    [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
 ; CHECK-NEXT:    call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
index c728572313d7..cbf8bc9dcf8f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll
@@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) {
 ; CHECK-NEXT:    [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1
 ; CHECK-NEXT:    [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1
-; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1
 ; CHECK-NEXT:    [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
new file mode 100644
index 000000000000..f90456297d7c
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ 0, [[BB10:%.*]] ], [ 0, [[BB:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10]] ], [ zeroinitializer, [[BB]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 poison, i32 undef>, i32 [[PHI7]], i32 0
+; CHECK-NEXT:    switch i32 0, label [[BB16:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB14:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB11:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb9:
+; CHECK-NEXT:    br label [[BB11]]
+; CHECK:       bb10:
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb11:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x i32> [ poison, [[BB9:%.*]] ], [ [[TMP1]], [[BB1]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb14:
+; CHECK-NEXT:    ret void
+; CHECK:       bb15:
+; CHECK-NEXT:    ret void
+; CHECK:       bb16:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ], [ poison, [[BB25:%.*]] ]
+; CHECK-NEXT:    ret void
+; CHECK:       bb25:
+; CHECK-NEXT:    switch i32 0, label [[BB16]] [
+; CHECK-NEXT:      i32 0, label [[BB14]]
+; CHECK-NEXT:      i32 1, label [[BB15:%.*]]
+; CHECK-NEXT:    ]
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi2 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi3 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi4 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi5 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi6 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi7 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  %phi8 = phi i32 [ 0, %bb10 ], [ 0, %bb ]
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb11
+  ]
+
+bb9:
+  br label %bb11
+
+bb10:
+  br label %bb1
+
+bb11:
+  %phi12 = phi i32 [ 0, %bb9 ], [ %phi7, %bb1 ]
+  %phi13 = phi i32 [ 0, %bb9 ], [ undef, %bb1 ]
+  ret void
+
+bb14:
+  ret void
+
+bb15:
+  ret void
+
+bb16:
+  %phi17 = phi i32 [ %phi, %bb1 ], [ 0, %bb25 ]
+  %phi18 = phi i32 [ %phi2, %bb1 ], [ 0, %bb25 ]
+  %phi19 = phi i32 [ %phi3, %bb1 ], [ 0, %bb25 ]
+  %phi20 = phi i32 [ %phi4, %bb1 ], [ 0, %bb25 ]
+  %phi21 = phi i32 [ %phi5, %bb1 ], [ 0, %bb25 ]
+  %phi22 = phi i32 [ %phi6, %bb1 ], [ 0, %bb25 ]
+  %phi23 = phi i32 [ %phi7, %bb1 ], [ 0, %bb25 ]
+  %phi24 = phi i32 [ %phi8, %bb1 ], [ 0, %bb25 ]
+  ret void
+
+bb25:
+  switch i32 0, label %bb16 [
+  i32 0, label %bb14
+  i32 1, label %bb15
+  ]
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
similarity index 91%
rename from llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
rename to llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
index 002b9a70255d..278e55c67f23 100644
--- a/llvm/test/Transforms/SLPVectorizer/icmp-altopcode-after-reordering.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/icmp-altopcode-after-reordering.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %}
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 
 define i32 @test(ptr %sptr, i64 %0) {
 ; CHECK-LABEL: define i32 @test(
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
new file mode 100644
index 000000000000..0dac02b0bcc0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll
@@ -0,0 +1,95 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: ptr [[B:%.*]], ptr [[C:%.*]], i32 [[TMP0:%.*]], ptr [[A:%.*]], i1 [[TOBOOL3_NOT:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[TOBOOL3_NOT]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = ashr <4 x i32> [[TMP2]], splat (i32 16)
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp slt <4 x i32> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i1> [[TMP4]] to <4 x i16>
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp sgt <4 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = zext <4 x i1> [[TMP8]] to <4 x i32>
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i1> poison, i1 [[TOBOOL3_NOT]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i1> [[TMP10]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP7]], <4 x i32> [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = shl <4 x i32> [[TMP12]], splat (i32 16)
+; CHECK-NEXT:    [[TMP14:%.*]] = ashr <4 x i32> [[TMP13]], splat (i32 16)
+; CHECK-NEXT:    [[TMP15:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i16>
+; CHECK-NEXT:    br i1 true, label [[BB3]], label [[BB2]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <4 x i16> [ [[TMP5]], [[BB1]] ], [ [[TMP15]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i16> [[TMP16]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
+; CHECK-NEXT:    store i32 [[TMP18]], ptr [[B]], align 16
+; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <4 x i16> [[TMP16]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP19]] to i32
+; CHECK-NEXT:    store i32 [[TMP20]], ptr [[A]], align 8
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x i16> [[TMP16]], i32 2
+; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    store i32 [[TMP22]], ptr [[C]], align 16
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x i16> [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    store i32 [[TMP24]], ptr [[B]], align 8
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  br i1 %tobool3.not, label %bb1, label %bb2
+
+bb1:
+  %conv1.i.us = ashr i32 %0, 16
+  %cmp2.i.us = icmp slt i32 %conv1.i.us, %0
+  %sext26.us = zext i1 %cmp2.i.us to i32
+  %conv1.i.us.5 = ashr i32 %0, 16
+  %cmp2.i.us.5 = icmp slt i32 %conv1.i.us.5, %0
+  %sext26.us.5 = zext i1 %cmp2.i.us.5 to i32
+  %conv1.i.us.6 = ashr i32 %0, 16
+  %cmp2.i.us.6 = icmp slt i32 %conv1.i.us.6, %0
+  %sext26.us.6 = zext i1 %cmp2.i.us.6 to i32
+  %conv1.i.us.7 = ashr i32 %0, 16
+  %cmp2.i.us.7 = icmp slt i32 %conv1.i.us.7, %0
+  %sext26.us.7 = zext i1 %cmp2.i.us.7 to i32
+  br label %bb3
+
+bb2:
+  %cmp2.i = icmp sgt i32 %0, 0
+  %1 = zext i1 %cmp2.i to i32
+  %cond.i = select i1 %tobool3.not, i32 %0, i32 %1
+  %sext26 = shl i32 %cond.i, 16
+  %conv13 = ashr i32 %sext26, 16
+  %cmp2.i.5 = icmp sgt i32 %0, 0
+  %2 = zext i1 %cmp2.i.5 to i32
+  %cond.i.5 = select i1 %tobool3.not, i32 %0, i32 %2
+  %sext26.5 = shl i32 %cond.i.5, 16
+  %conv13.5 = ashr i32 %sext26.5, 16
+  %cmp2.i.6 = icmp sgt i32 %0, 0
+  %3 = zext i1 %cmp2.i.6 to i32
+  %cond.i.6 = select i1 %tobool3.not, i32 %0, i32 %3
+  %sext26.6 = shl i32 %cond.i.6, 16
+  %conv13.6 = ashr i32 %sext26.6, 16
+  %cmp2.i.7 = icmp sgt i32 %0, 0
+  %4 = zext i1 %cmp2.i.7 to i32
+  %cond.i.7 = select i1 %tobool3.not, i32 %0, i32 %4
+  %sext26.7 = shl i32 %cond.i.7, 16
+  %conv13.7 = ashr i32 %sext26.7, 16
+  br i1 true, label %bb3, label %bb2
+
+bb3:
+  %conv13p = phi i32 [ %sext26.us, %bb1 ], [ %conv13, %bb2 ]
+  %conv13.5p = phi i32 [ %sext26.us.5, %bb1 ], [ %conv13.5, %bb2 ]
+  %conv13.6p = phi i32 [ %sext26.us.6, %bb1 ], [ %conv13.6, %bb2 ]
+  %conv13.7p = phi i32 [ %sext26.us.7, %bb1 ], [ %conv13.7, %bb2 ]
+  store i32 %conv13p, ptr %b, align 16
+  store i32 %conv13.5p, ptr %a, align 8
+  store i32 %conv13.6p, ptr %c, align 16
+  store i32 %conv13.7p, ptr %b, align 8
+  ret i32 0
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revectorize-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/revectorize-phis.ll
new file mode 100644
index 000000000000..9eec3074b534
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revectorize-phis.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: define void @test() {
+; CHECK-NEXT:  [[BB:.*]]:
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI]], %[[BB31:.*]] ], [ [[PHI]], %[[BB32:.*]] ], [ [[PHI]], %[[BB33:.*]] ], [ [[PHI]], %[[BB36:.*]] ], [ [[PHI]], %[[BB39:.*]] ], [ [[PHI]], %[[BB40:.*]] ], [ [[PHI]], %[[BB41:.*]] ], [ [[PHI]], %[[BB42:.*]] ], [ [[PHI]], %[[BB43:.*]] ], [ 0, %[[BB45:.*]] ], [ [[PHI]], %[[BB21:.*]] ], [ [[PHI]], %[[BB22:.*]] ], [ [[PHI]], %[[BB23:.*]] ], [ [[PHI]], %[[BB24:.*]] ], [ [[PHI]], %[[BB25:.*]] ], [ [[PHI]], %[[BB26:.*]] ], [ [[PHI]], %[[BB27:.*]] ], [ [[PHI]], %[[BB28:.*]] ], [ [[PHI]], %[[BB29:.*]] ], [ 0, %[[BB30:.*]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI2]], %[[BB31]] ], [ [[PHI2]], %[[BB32]] ], [ [[PHI2]], %[[BB33]] ], [ [[PHI2]], %[[BB36]] ], [ [[PHI2]], %[[BB39]] ], [ [[PHI2]], %[[BB40]] ], [ [[PHI2]], %[[BB41]] ], [ [[PHI2]], %[[BB42]] ], [ [[LOAD44:%.*]], %[[BB43]] ], [ [[PHI2]], %[[BB45]] ], [ [[PHI2]], %[[BB21]] ], [ 0, %[[BB22]] ], [ [[PHI2]], %[[BB23]] ], [ [[PHI2]], %[[BB24]] ], [ [[PHI2]], %[[BB25]] ], [ [[PHI2]], %[[BB26]] ], [ [[PHI2]], %[[BB27]] ], [ [[PHI2]], %[[BB28]] ], [ [[PHI2]], %[[BB29]] ], [ [[PHI2]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI3:%.*]] = phi i64 [ 0, %[[BB]] ], [ 0, %[[BB31]] ], [ [[PHI3]], %[[BB32]] ], [ [[PHI3]], %[[BB33]] ], [ [[PHI3]], %[[BB36]] ], [ [[PHI3]], %[[BB39]] ], [ [[PHI3]], %[[BB40]] ], [ [[PHI3]], %[[BB41]] ], [ 0, %[[BB42]] ], [ [[PHI3]], %[[BB43]] ], [ [[PHI3]], %[[BB45]] ], [ [[PHI3]], %[[BB21]] ], [ [[PHI3]], %[[BB22]] ], [ [[PHI3]], %[[BB23]] ], [ [[PHI3]], %[[BB24]] ], [ [[PHI3]], %[[BB25]] ], [ [[PHI3]], %[[BB26]] ], [ [[PHI3]], %[[BB27]] ], [ [[PHI3]], %[[BB28]] ], [ [[PHI3]], %[[BB29]] ], [ [[PHI3]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI4:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI4]], %[[BB31]] ], [ [[PHI4]], %[[BB32]] ], [ [[PHI4]], %[[BB33]] ], [ [[PHI4]], %[[BB36]] ], [ [[PHI4]], %[[BB39]] ], [ [[PHI4]], %[[BB40]] ], [ [[PHI4]], %[[BB41]] ], [ [[PHI4]], %[[BB42]] ], [ [[PHI4]], %[[BB43]] ], [ [[PHI4]], %[[BB45]] ], [ [[PHI4]], %[[BB21]] ], [ [[PHI4]], %[[BB22]] ], [ [[PHI4]], %[[BB23]] ], [ [[PHI4]], %[[BB24]] ], [ [[PHI4]], %[[BB25]] ], [ [[PHI4]], %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI4]], %[[BB28]] ], [ [[PHI4]], %[[BB29]] ], [ [[PHI4]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI5:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI5]], %[[BB31]] ], [ [[PHI5]], %[[BB32]] ], [ [[PHI5]], %[[BB33]] ], [ [[PHI5]], %[[BB36]] ], [ 0, %[[BB39]] ], [ [[PHI5]], %[[BB40]] ], [ [[PHI5]], %[[BB41]] ], [ [[PHI5]], %[[BB42]] ], [ [[PHI5]], %[[BB43]] ], [ [[PHI5]], %[[BB45]] ], [ 0, %[[BB21]] ], [ [[PHI5]], %[[BB22]] ], [ [[PHI5]], %[[BB23]] ], [ [[PHI5]], %[[BB24]] ], [ [[PHI5]], %[[BB25]] ], [ [[PHI5]], %[[BB26]] ], [ [[PHI5]], %[[BB27]] ], [ [[PHI5]], %[[BB28]] ], [ [[PHI5]], %[[BB29]] ], [ [[PHI5]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI6:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI6]], %[[BB31]] ], [ 0, %[[BB32]] ], [ 0, %[[BB33]] ], [ [[LOAD38:%.*]], %[[BB36]] ], [ 0, %[[BB39]] ], [ 0, %[[BB40]] ], [ [[PHI6]], %[[BB41]] ], [ [[PHI6]], %[[BB42]] ], [ [[PHI6]], %[[BB43]] ], [ [[PHI6]], %[[BB45]] ], [ [[PHI6]], %[[BB21]] ], [ 0, %[[BB22]] ], [ [[PHI6]], %[[BB23]] ], [ [[PHI6]], %[[BB24]] ], [ [[PHI6]], %[[BB25]] ], [ 0, %[[BB26]] ], [ [[PHI6]], %[[BB27]] ], [ [[PHI6]], %[[BB28]] ], [ [[PHI6]], %[[BB29]] ], [ [[PHI6]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI7]], %[[BB31]] ], [ [[PTRTOINT:%.*]], %[[BB32]] ], [ [[PHI7]], %[[BB33]] ], [ [[PHI7]], %[[BB36]] ], [ [[PHI7]], %[[BB39]] ], [ [[PHI7]], %[[BB40]] ], [ [[PHI7]], %[[BB41]] ], [ [[PHI7]], %[[BB42]] ], [ [[PHI7]], %[[BB43]] ], [ 0, %[[BB45]] ], [ [[PHI7]], %[[BB21]] ], [ [[PHI7]], %[[BB22]] ], [ [[PHI7]], %[[BB23]] ], [ [[PHI7]], %[[BB24]] ], [ [[PHI7]], %[[BB25]] ], [ [[PHI7]], %[[BB26]] ], [ [[PHI7]], %[[BB27]] ], [ [[PHI7]], %[[BB28]] ], [ [[PHI7]], %[[BB29]] ], [ [[PHI7]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI8:%.*]] = phi i64 [ 0, %[[BB]] ], [ 0, %[[BB31]] ], [ [[PHI8]], %[[BB32]] ], [ [[PHI8]], %[[BB33]] ], [ [[PHI8]], %[[BB36]] ], [ [[PHI8]], %[[BB39]] ], [ [[PHI8]], %[[BB40]] ], [ [[PHI8]], %[[BB41]] ], [ [[PHI8]], %[[BB42]] ], [ [[PHI8]], %[[BB43]] ], [ [[PHI8]], %[[BB45]] ], [ [[PHI8]], %[[BB21]] ], [ [[PHI8]], %[[BB22]] ], [ [[PHI8]], %[[BB23]] ], [ [[PHI8]], %[[BB24]] ], [ [[PHI8]], %[[BB25]] ], [ [[PHI8]], %[[BB26]] ], [ [[PHI8]], %[[BB27]] ], [ [[PHI8]], %[[BB28]] ], [ [[PHI8]], %[[BB29]] ], [ [[PHI8]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI9:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI9]], %[[BB31]] ], [ [[PHI9]], %[[BB32]] ], [ [[PHI9]], %[[BB33]] ], [ [[PHI9]], %[[BB36]] ], [ [[PHI9]], %[[BB39]] ], [ [[PHI9]], %[[BB40]] ], [ [[PHI9]], %[[BB41]] ], [ 0, %[[BB42]] ], [ [[PHI9]], %[[BB43]] ], [ [[LOAD46:%.*]], %[[BB45]] ], [ [[PHI9]], %[[BB21]] ], [ [[PHI9]], %[[BB22]] ], [ 0, %[[BB23]] ], [ [[PHI9]], %[[BB24]] ], [ [[PHI9]], %[[BB25]] ], [ [[PHI9]], %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI9]], %[[BB28]] ], [ [[PHI9]], %[[BB29]] ], [ [[PHI]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI10:%.*]] = phi i64 [ 0, %[[BB]] ], [ 0, %[[BB31]] ], [ [[PHI10]], %[[BB32]] ], [ [[PHI10]], %[[BB33]] ], [ [[PHI10]], %[[BB36]] ], [ 0, %[[BB39]] ], [ [[PHI10]], %[[BB40]] ], [ [[PHI10]], %[[BB41]] ], [ [[PHI10]], %[[BB42]] ], [ 0, %[[BB43]] ], [ [[PHI10]], %[[BB45]] ], [ [[PHI10]], %[[BB21]] ], [ [[PHI10]], %[[BB22]] ], [ [[PHI10]], %[[BB23]] ], [ 0, %[[BB24]] ], [ [[PHI10]], %[[BB25]] ], [ [[PHI10]], %[[BB26]] ], [ [[PHI10]], %[[BB27]] ], [ [[PHI10]], %[[BB28]] ], [ [[PHI2]], %[[BB29]] ], [ [[PHI10]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI11:%.*]] = phi i64 [ undef, %[[BB]] ], [ [[PHI11]], %[[BB31]] ], [ [[PHI11]], %[[BB32]] ], [ [[PHI11]], %[[BB33]] ], [ [[PHI11]], %[[BB36]] ], [ [[PHI11]], %[[BB39]] ], [ [[PHI11]], %[[BB40]] ], [ [[PHI11]], %[[BB41]] ], [ 0, %[[BB42]] ], [ 0, %[[BB43]] ], [ [[PHI11]], %[[BB45]] ], [ [[PHI11]], %[[BB21]] ], [ [[PHI11]], %[[BB22]] ], [ [[PHI11]], %[[BB23]] ], [ [[PHI11]], %[[BB24]] ], [ [[PHI11]], %[[BB25]] ], [ [[PHI11]], %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI3]], %[[BB28]] ], [ [[PHI11]], %[[BB29]] ], [ [[PHI11]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI12:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI12]], %[[BB31]] ], [ 0, %[[BB32]] ], [ [[PHI12]], %[[BB33]] ], [ [[LOAD37:%.*]], %[[BB36]] ], [ [[PHI12]], %[[BB39]] ], [ [[PHI12]], %[[BB40]] ], [ [[PHI12]], %[[BB41]] ], [ [[PHI12]], %[[BB42]] ], [ [[PHI12]], %[[BB43]] ], [ [[PHI12]], %[[BB45]] ], [ [[PHI12]], %[[BB21]] ], [ [[PHI12]], %[[BB22]] ], [ [[PHI12]], %[[BB23]] ], [ 1, %[[BB24]] ], [ [[PHI12]], %[[BB25]] ], [ 0, %[[BB26]] ], [ [[PHI12]], %[[BB27]] ], [ [[PHI12]], %[[BB28]] ], [ [[PHI12]], %[[BB29]] ], [ [[PHI12]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI13:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI13]], %[[BB31]] ], [ [[PHI13]], %[[BB32]] ], [ [[LOAD35:%.*]], %[[BB33]] ], [ [[PHI13]], %[[BB36]] ], [ [[PHI13]], %[[BB39]] ], [ [[PHI13]], %[[BB40]] ], [ [[PHI13]], %[[BB41]] ], [ 0, %[[BB42]] ], [ [[PHI13]], %[[BB43]] ], [ [[PHI13]], %[[BB45]] ], [ [[PHI13]], %[[BB21]] ], [ [[PHI13]], %[[BB22]] ], [ 0, %[[BB23]] ], [ 1, %[[BB24]] ], [ [[PHI13]], %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI13]], %[[BB28]] ], [ [[PHI13]], %[[BB29]] ], [ [[PHI13]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI14:%.*]] = phi i64 [ 0, %[[BB]] ], [ 0, %[[BB31]] ], [ [[PHI14]], %[[BB32]] ], [ [[LOAD34:%.*]], %[[BB33]] ], [ 0, %[[BB36]] ], [ [[PHI14]], %[[BB39]] ], [ [[PHI14]], %[[BB40]] ], [ [[PHI14]], %[[BB41]] ], [ [[PHI14]], %[[BB42]] ], [ [[PHI14]], %[[BB43]] ], [ [[PHI14]], %[[BB45]] ], [ [[PHI14]], %[[BB21]] ], [ [[PHI14]], %[[BB22]] ], [ 0, %[[BB23]] ], [ [[PHI14]], %[[BB24]] ], [ [[PHI14]], %[[BB25]] ], [ [[PHI14]], %[[BB26]] ], [ [[PHI14]], %[[BB27]] ], [ [[PHI14]], %[[BB28]] ], [ [[PHI14]], %[[BB29]] ], [ [[PHI14]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI15:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[LOAD:%.*]], %[[BB31]] ], [ [[PHI15]], %[[BB32]] ], [ [[PHI15]], %[[BB33]] ], [ [[PHI15]], %[[BB36]] ], [ 0, %[[BB39]] ], [ [[PHI15]], %[[BB40]] ], [ [[PHI15]], %[[BB41]] ], [ 0, %[[BB42]] ], [ [[PHI15]], %[[BB43]] ], [ [[PHI15]], %[[BB45]] ], [ [[PHI8]], %[[BB21]] ], [ [[PHI15]], %[[BB22]] ], [ [[PHI15]], %[[BB23]] ], [ [[PHI15]], %[[BB24]] ], [ [[PHI15]], %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI15]], %[[BB28]] ], [ [[PHI15]], %[[BB29]] ], [ [[PHI15]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI16:%.*]] = phi i8 [ 3, %[[BB]] ], [ 0, %[[BB31]] ], [ 0, %[[BB32]] ], [ 0, %[[BB33]] ], [ 0, %[[BB36]] ], [ 0, %[[BB39]] ], [ 0, %[[BB40]] ], [ 0, %[[BB41]] ], [ 0, %[[BB42]] ], [ 0, %[[BB43]] ], [ 0, %[[BB45]] ], [ 0, %[[BB21]] ], [ 0, %[[BB22]] ], [ 0, %[[BB23]] ], [ 0, %[[BB24]] ], [ 0, %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ 0, %[[BB28]] ], [ 0, %[[BB29]] ], [ 0, %[[BB30]] ]
+; CHECK-NEXT:    [[PHI17:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI17]], %[[BB31]] ], [ [[PHI17]], %[[BB32]] ], [ [[PHI17]], %[[BB33]] ], [ [[PHI17]], %[[BB36]] ], [ 0, %[[BB39]] ], [ [[PHI17]], %[[BB40]] ], [ [[PHI4]], %[[BB41]] ], [ [[PHI17]], %[[BB42]] ], [ 0, %[[BB43]] ], [ [[PHI17]], %[[BB45]] ], [ [[PHI17]], %[[BB21]] ], [ [[PHI17]], %[[BB22]] ], [ [[PHI17]], %[[BB23]] ], [ 0, %[[BB24]] ], [ [[PHI17]], %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI17]], %[[BB28]] ], [ [[PHI17]], %[[BB29]] ], [ [[PHI17]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI18:%.*]] = phi i8 [ 0, %[[BB]] ], [ 0, %[[BB31]] ], [ 0, %[[BB32]] ], [ 0, %[[BB33]] ], [ 0, %[[BB36]] ], [ 0, %[[BB39]] ], [ 0, %[[BB40]] ], [ 0, %[[BB41]] ], [ 0, %[[BB42]] ], [ 0, %[[BB43]] ], [ 0, %[[BB45]] ], [ 0, %[[BB21]] ], [ 0, %[[BB22]] ], [ 0, %[[BB23]] ], [ 0, %[[BB24]] ], [ 0, %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ 0, %[[BB28]] ], [ 0, %[[BB29]] ], [ 0, %[[BB30]] ]
+; CHECK-NEXT:    [[PHI19:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI19]], %[[BB31]] ], [ [[PHI19]], %[[BB32]] ], [ [[PHI19]], %[[BB33]] ], [ [[PHI19]], %[[BB36]] ], [ 0, %[[BB39]] ], [ [[PHI19]], %[[BB40]] ], [ [[PHI19]], %[[BB41]] ], [ [[PHI19]], %[[BB42]] ], [ 0, %[[BB43]] ], [ [[PHI19]], %[[BB45]] ], [ [[PHI19]], %[[BB21]] ], [ [[PHI19]], %[[BB22]] ], [ [[PHI19]], %[[BB23]] ], [ [[PHI19]], %[[BB24]] ], [ [[PHI5]], %[[BB25]] ], [ 0, %[[BB26]] ], [ 0, %[[BB27]] ], [ [[PHI19]], %[[BB28]] ], [ [[PHI19]], %[[BB29]] ], [ [[PHI19]], %[[BB30]] ]
+; CHECK-NEXT:    [[PHI20:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[PHI20]], %[[BB31]] ], [ [[PTRTOINT]], %[[BB32]] ], [ [[PHI20]], %[[BB33]] ], [ [[PHI20]], %[[BB36]] ], [ [[PHI20]], %[[BB39]] ], [ 0, %[[BB40]] ], [ [[PHI20]], %[[BB41]] ], [ 0, %[[BB42]] ], [ [[PHI20]], %[[BB43]] ], [ [[PHI20]], %[[BB45]] ], [ [[PHI20]], %[[BB21]] ], [ [[PHI7]], %[[BB22]] ], [ [[PHI20]], %[[BB23]] ], [ [[PHI20]], %[[BB24]] ], [ [[PHI20]], %[[BB25]] ], [ 0, %[[BB26]] ], [ [[PHI20]], %[[BB27]] ], [ [[PHI20]], %[[BB28]] ], [ [[PHI20]], %[[BB29]] ], [ [[PHI20]], %[[BB30]] ]
+; CHECK-NEXT:    switch i8 0, label %[[BB30]] [
+; CHECK-NEXT:      i8 9, label %[[BB26]]
+; CHECK-NEXT:      i8 0, label %[[BB29]]
+; CHECK-NEXT:      i8 13, label %[[BB28]]
+; CHECK-NEXT:      i8 1, label %[[BB21]]
+; CHECK-NEXT:      i8 3, label %[[BB22]]
+; CHECK-NEXT:      i8 12, label %[[BB27]]
+; CHECK-NEXT:      i8 5, label %[[BB23]]
+; CHECK-NEXT:      i8 6, label %[[BB24]]
+; CHECK-NEXT:      i8 7, label %[[BB25]]
+; CHECK-NEXT:    ]
+; CHECK:       [[BB21]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB31]]
+; CHECK:       [[BB22]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB32]]
+; CHECK:       [[BB23]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB33]]
+; CHECK:       [[BB24]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB36]]
+; CHECK:       [[BB25]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB39]]
+; CHECK:       [[BB26]]:
+; CHECK-NEXT:    br i1 false, label %[[BB40]], label %[[BB1]]
+; CHECK:       [[BB27]]:
+; CHECK-NEXT:    br i1 false, label %[[BB41]], label %[[BB1]]
+; CHECK:       [[BB28]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB42]]
+; CHECK:       [[BB29]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB43]]
+; CHECK:       [[BB30]]:
+; CHECK-NEXT:    br i1 false, label %[[BB1]], label %[[BB45]]
+; CHECK:       [[BB31]]:
+; CHECK-NEXT:    [[LOAD]] = load i64, ptr null, align 8
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB32]]:
+; CHECK-NEXT:    [[PTRTOINT]] = ptrtoint ptr null to i64
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB33]]:
+; CHECK-NEXT:    [[LOAD34]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[LOAD35]] = load i64, ptr null, align 8
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB36]]:
+; CHECK-NEXT:    [[LOAD37]] = load i64, ptr null, align 8
+; CHECK-NEXT:    [[LOAD38]] = load i64, ptr null, align 8
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB39]]:
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB40]]:
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB41]]:
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB42]]:
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB43]]:
+; CHECK-NEXT:    [[LOAD44]] = load i64, ptr null, align 8
+; CHECK-NEXT:    br label %[[BB1]]
+; CHECK:       [[BB45]]:
+; CHECK-NEXT:    [[LOAD46]] = load i64, ptr null, align 8
+; CHECK-NEXT:    br label %[[BB1]]
+;
+bb:
+  br label %bb1
+
+bb1:
+  %phi = phi i64 [ 0, %bb ], [ %phi, %bb31 ], [ %phi, %bb32 ], [ %phi, %bb33 ], [ %phi, %bb36 ], [ %phi, %bb39 ], [ %phi, %bb40 ], [ %phi, %bb41 ], [ %phi, %bb42 ], [ %phi, %bb43 ], [ 0, %bb45 ], [ %phi, %bb21 ], [ %phi, %bb22 ], [ %phi, %bb23 ], [ %phi, %bb24 ], [ %phi, %bb25 ], [ %phi, %bb26 ], [ %phi, %bb27 ], [ %phi, %bb28 ], [ %phi, %bb29 ], [ 0, %bb30 ]
+  %phi2 = phi i64 [ undef, %bb ], [ %phi2, %bb31 ], [ %phi2, %bb32 ], [ %phi2, %bb33 ], [ %phi2, %bb36 ], [ %phi2, %bb39 ], [ %phi2, %bb40 ], [ %phi2, %bb41 ], [ %phi2, %bb42 ], [ %load44, %bb43 ], [ %phi2, %bb45 ], [ %phi2, %bb21 ], [ 0, %bb22 ], [ %phi2, %bb23 ], [ %phi2, %bb24 ], [ %phi2, %bb25 ], [ %phi2, %bb26 ], [ %phi2, %bb27 ], [ %phi2, %bb28 ], [ %phi2, %bb29 ], [ %phi2, %bb30 ]
+  %phi3 = phi i64 [ 0, %bb ], [ 0, %bb31 ], [ %phi3, %bb32 ], [ %phi3, %bb33 ], [ %phi3, %bb36 ], [ %phi3, %bb39 ], [ %phi3, %bb40 ], [ %phi3, %bb41 ], [ 0, %bb42 ], [ %phi3, %bb43 ], [ %phi3, %bb45 ], [ %phi3, %bb21 ], [ %phi3, %bb22 ], [ %phi3, %bb23 ], [ %phi3, %bb24 ], [ %phi3, %bb25 ], [ %phi3, %bb26 ], [ %phi3, %bb27 ], [ %phi3, %bb28 ], [ %phi3, %bb29 ], [ %phi3, %bb30 ]
+  %phi4 = phi i64 [ 0, %bb ], [ %phi4, %bb31 ], [ %phi4, %bb32 ], [ %phi4, %bb33 ], [ %phi4, %bb36 ], [ %phi4, %bb39 ], [ %phi4, %bb40 ], [ %phi4, %bb41 ], [ %phi4, %bb42 ], [ %phi4, %bb43 ], [ %phi4, %bb45 ], [ %phi4, %bb21 ], [ %phi4, %bb22 ], [ %phi4, %bb23 ], [ %phi4, %bb24 ], [ %phi4, %bb25 ], [ %phi4, %bb26 ], [ 0, %bb27 ], [ %phi4, %bb28 ], [ %phi4, %bb29 ], [ %phi4, %bb30 ]
+  %phi5 = phi i64 [ undef, %bb ], [ %phi5, %bb31 ], [ %phi5, %bb32 ], [ %phi5, %bb33 ], [ %phi5, %bb36 ], [ 0, %bb39 ], [ %phi5, %bb40 ], [ %phi5, %bb41 ], [ %phi5, %bb42 ], [ %phi5, %bb43 ], [ %phi5, %bb45 ], [ 0, %bb21 ], [ %phi5, %bb22 ], [ %phi5, %bb23 ], [ %phi5, %bb24 ], [ %phi5, %bb25 ], [ %phi5, %bb26 ], [ %phi5, %bb27 ], [ %phi5, %bb28 ], [ %phi5, %bb29 ], [ %phi5, %bb30 ]
+  %phi6 = phi i64 [ undef, %bb ], [ %phi6, %bb31 ], [ 0, %bb32 ], [ 0, %bb33 ], [ %load38, %bb36 ], [ 0, %bb39 ], [ 0, %bb40 ], [ %phi6, %bb41 ], [ %phi6, %bb42 ], [ %phi6, %bb43 ], [ %phi6, %bb45 ], [ %phi6, %bb21 ], [ 0, %bb22 ], [ %phi6, %bb23 ], [ %phi6, %bb24 ], [ %phi6, %bb25 ], [ 0, %bb26 ], [ %phi6, %bb27 ], [ %phi6, %bb28 ], [ %phi6, %bb29 ], [ %phi6, %bb30 ]
+  %phi7 = phi i64 [ undef, %bb ], [ %phi7, %bb31 ], [ %ptrtoint, %bb32 ], [ %phi7, %bb33 ], [ %phi7, %bb36 ], [ %phi7, %bb39 ], [ %phi7, %bb40 ], [ %phi7, %bb41 ], [ %phi7, %bb42 ], [ %phi7, %bb43 ], [ 0, %bb45 ], [ %phi7, %bb21 ], [ %phi7, %bb22 ], [ %phi7, %bb23 ], [ %phi7, %bb24 ], [ %phi7, %bb25 ], [ %phi7, %bb26 ], [ %phi7, %bb27 ], [ %phi7, %bb28 ], [ %phi7, %bb29 ], [ %phi7, %bb30 ]
+  %phi8 = phi i64 [ 0, %bb ], [ 0, %bb31 ], [ %phi8, %bb32 ], [ %phi8, %bb33 ], [ %phi8, %bb36 ], [ %phi8, %bb39 ], [ %phi8, %bb40 ], [ %phi8, %bb41 ], [ %phi8, %bb42 ], [ %phi8, %bb43 ], [ %phi8, %bb45 ], [ %phi8, %bb21 ], [ %phi8, %bb22 ], [ %phi8, %bb23 ], [ %phi8, %bb24 ], [ %phi8, %bb25 ], [ %phi8, %bb26 ], [ %phi8, %bb27 ], [ %phi8, %bb28 ], [ %phi8, %bb29 ], [ %phi8, %bb30 ]
+  %phi9 = phi i64 [ undef, %bb ], [ %phi9, %bb31 ], [ %phi9, %bb32 ], [ %phi9, %bb33 ], [ %phi9, %bb36 ], [ %phi9, %bb39 ], [ %phi9, %bb40 ], [ %phi9, %bb41 ], [ 0, %bb42 ], [ %phi9, %bb43 ], [ %load46, %bb45 ], [ %phi9, %bb21 ], [ %phi9, %bb22 ], [ 0, %bb23 ], [ %phi9, %bb24 ], [ %phi9, %bb25 ], [ %phi9, %bb26 ], [ 0, %bb27 ], [ %phi9, %bb28 ], [ %phi9, %bb29 ], [ %phi, %bb30 ]
+  %phi10 = phi i64 [ 0, %bb ], [ 0, %bb31 ], [ %phi10, %bb32 ], [ %phi10, %bb33 ], [ %phi10, %bb36 ], [ 0, %bb39 ], [ %phi10, %bb40 ], [ %phi10, %bb41 ], [ %phi10, %bb42 ], [ 0, %bb43 ], [ %phi10, %bb45 ], [ %phi10, %bb21 ], [ %phi10, %bb22 ], [ %phi10, %bb23 ], [ 0, %bb24 ], [ %phi10, %bb25 ], [ %phi10, %bb26 ], [ %phi10, %bb27 ], [ %phi10, %bb28 ], [ %phi2, %bb29 ], [ %phi10, %bb30 ]
+  %phi11 = phi i64 [ undef, %bb ], [ %phi11, %bb31 ], [ %phi11, %bb32 ], [ %phi11, %bb33 ], [ %phi11, %bb36 ], [ %phi11, %bb39 ], [ %phi11, %bb40 ], [ %phi11, %bb41 ], [ 0, %bb42 ], [ 0, %bb43 ], [ %phi11, %bb45 ], [ %phi11, %bb21 ], [ %phi11, %bb22 ], [ %phi11, %bb23 ], [ %phi11, %bb24 ], [ %phi11, %bb25 ], [ %phi11, %bb26 ], [ 0, %bb27 ], [ %phi3, %bb28 ], [ %phi11, %bb29 ], [ %phi11, %bb30 ]
+  %phi12 = phi i64 [ 0, %bb ], [ %phi12, %bb31 ], [ 0, %bb32 ], [ %phi12, %bb33 ], [ %load37, %bb36 ], [ %phi12, %bb39 ], [ %phi12, %bb40 ], [ %phi12, %bb41 ], [ %phi12, %bb42 ], [ %phi12, %bb43 ], [ %phi12, %bb45 ], [ %phi12, %bb21 ], [ %phi12, %bb22 ], [ %phi12, %bb23 ], [ 1, %bb24 ], [ %phi12, %bb25 ], [ 0, %bb26 ], [ %phi12, %bb27 ], [ %phi12, %bb28 ], [ %phi12, %bb29 ], [ %phi12, %bb30 ]
+  %phi13 = phi i64 [ 0, %bb ], [ %phi13, %bb31 ], [ %phi13, %bb32 ], [ %load35, %bb33 ], [ %phi13, %bb36 ], [ %phi13, %bb39 ], [ %phi13, %bb40 ], [ %phi13, %bb41 ], [ 0, %bb42 ], [ %phi13, %bb43 ], [ %phi13, %bb45 ], [ %phi13, %bb21 ], [ %phi13, %bb22 ], [ 0, %bb23 ], [ 1, %bb24 ], [ %phi13, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ %phi13, %bb28 ], [ %phi13, %bb29 ], [ %phi13, %bb30 ]
+  %phi14 = phi i64 [ 0, %bb ], [ 0, %bb31 ], [ %phi14, %bb32 ], [ %load34, %bb33 ], [ 0, %bb36 ], [ %phi14, %bb39 ], [ %phi14, %bb40 ], [ %phi14, %bb41 ], [ %phi14, %bb42 ], [ %phi14, %bb43 ], [ %phi14, %bb45 ], [ %phi14, %bb21 ], [ %phi14, %bb22 ], [ 0, %bb23 ], [ %phi14, %bb24 ], [ %phi14, %bb25 ], [ %phi14, %bb26 ], [ %phi14, %bb27 ], [ %phi14, %bb28 ], [ %phi14, %bb29 ], [ %phi14, %bb30 ]
+  %phi15 = phi i64 [ 0, %bb ], [ %load, %bb31 ], [ %phi15, %bb32 ], [ %phi15, %bb33 ], [ %phi15, %bb36 ], [ 0, %bb39 ], [ %phi15, %bb40 ], [ %phi15, %bb41 ], [ 0, %bb42 ], [ %phi15, %bb43 ], [ %phi15, %bb45 ], [ %phi8, %bb21 ], [ %phi15, %bb22 ], [ %phi15, %bb23 ], [ %phi15, %bb24 ], [ %phi15, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ %phi15, %bb28 ], [ %phi15, %bb29 ], [ %phi15, %bb30 ]
+  %phi16 = phi i8 [ 3, %bb ], [ 0, %bb31 ], [ 0, %bb32 ], [ 0, %bb33 ], [ 0, %bb36 ], [ 0, %bb39 ], [ 0, %bb40 ], [ 0, %bb41 ], [ 0, %bb42 ], [ 0, %bb43 ], [ 0, %bb45 ], [ 0, %bb21 ], [ 0, %bb22 ], [ 0, %bb23 ], [ 0, %bb24 ], [ 0, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ 0, %bb28 ], [ 0, %bb29 ], [ 0, %bb30 ]
+  %phi17 = phi i64 [ 0, %bb ], [ %phi17, %bb31 ], [ %phi17, %bb32 ], [ %phi17, %bb33 ], [ %phi17, %bb36 ], [ 0, %bb39 ], [ %phi17, %bb40 ], [ %phi4, %bb41 ], [ %phi17, %bb42 ], [ 0, %bb43 ], [ %phi17, %bb45 ], [ %phi17, %bb21 ], [ %phi17, %bb22 ], [ %phi17, %bb23 ], [ 0, %bb24 ], [ %phi17, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ %phi17, %bb28 ], [ %phi17, %bb29 ], [ %phi17, %bb30 ]
+  %phi18 = phi i8 [ 0, %bb ], [ 0, %bb31 ], [ 0, %bb32 ], [ 0, %bb33 ], [ 0, %bb36 ], [ 0, %bb39 ], [ 0, %bb40 ], [ 0, %bb41 ], [ 0, %bb42 ], [ 0, %bb43 ], [ 0, %bb45 ], [ 0, %bb21 ], [ 0, %bb22 ], [ 0, %bb23 ], [ 0, %bb24 ], [ 0, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ 0, %bb28 ], [ 0, %bb29 ], [ 0, %bb30 ]
+  %phi19 = phi i64 [ 0, %bb ], [ %phi19, %bb31 ], [ %phi19, %bb32 ], [ %phi19, %bb33 ], [ %phi19, %bb36 ], [ 0, %bb39 ], [ %phi19, %bb40 ], [ %phi19, %bb41 ], [ %phi19, %bb42 ], [ 0, %bb43 ], [ %phi19, %bb45 ], [ %phi19, %bb21 ], [ %phi19, %bb22 ], [ %phi19, %bb23 ], [ %phi19, %bb24 ], [ %phi5, %bb25 ], [ 0, %bb26 ], [ 0, %bb27 ], [ %phi19, %bb28 ], [ %phi19, %bb29 ], [ %phi19, %bb30 ]
+  %phi20 = phi i64 [ 0, %bb ], [ %phi20, %bb31 ], [ %ptrtoint, %bb32 ], [ %phi20, %bb33 ], [ %phi20, %bb36 ], [ %phi20, %bb39 ], [ 0, %bb40 ], [ %phi20, %bb41 ], [ 0, %bb42 ], [ %phi20, %bb43 ], [ %phi20, %bb45 ], [ %phi20, %bb21 ], [ %phi7, %bb22 ], [ %phi20, %bb23 ], [ %phi20, %bb24 ], [ %phi20, %bb25 ], [ 0, %bb26 ], [ %phi20, %bb27 ], [ %phi20, %bb28 ], [ %phi20, %bb29 ], [ %phi20, %bb30 ]
+  switch i8 0, label %bb30 [
+  i8 9, label %bb26
+  i8 0, label %bb29
+  i8 13, label %bb28
+  i8 1, label %bb21
+  i8 3, label %bb22
+  i8 12, label %bb27
+  i8 5, label %bb23
+  i8 6, label %bb24
+  i8 7, label %bb25
+  ]
+
+bb21:
+  br i1 false, label %bb1, label %bb31
+
+bb22:
+  br i1 false, label %bb1, label %bb32
+
+bb23:
+  br i1 false, label %bb1, label %bb33
+
+bb24:
+  br i1 false, label %bb1, label %bb36
+
+bb25:
+  br i1 false, label %bb1, label %bb39
+
+bb26:
+  br i1 false, label %bb40, label %bb1
+
+bb27:
+  br i1 false, label %bb41, label %bb1
+
+bb28:
+  br i1 false, label %bb1, label %bb42
+
+bb29:
+  br i1 false, label %bb1, label %bb43
+
+bb30:
+  br i1 false, label %bb1, label %bb45
+
+bb31:
+  %load = load i64, ptr null, align 8
+  br label %bb1
+
+bb32:
+  %ptrtoint = ptrtoint ptr null to i64
+  br label %bb1
+
+bb33:
+  %load34 = load i64, ptr null, align 8
+  %load35 = load i64, ptr null, align 8
+  br label %bb1
+
+bb36:
+  %load37 = load i64, ptr null, align 8
+  %load38 = load i64, ptr null, align 8
+  br label %bb1
+
+bb39:
+  br label %bb1
+
+bb40:
+  br label %bb1
+
+bb41:
+  br label %bb1
+
+bb42:
+  br label %bb1
+
+bb43:
+  %load44 = load i64, ptr null, align 8
+  br label %bb1
+
+bb45:
+  %load46 = load i64, ptr null, align 8
+  br label %bb1
+}
+
diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
index 9b6511d0d828..d880c6b1783c 100644
--- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
+++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll
@@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) {
 ; AARCH64-LABEL: define <2 x i32> @test(
 ; AARCH64-SAME: i32 [[ARG:%.*]]) {
 ; AARCH64-NEXT:  bb:
-; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 1>, i32 [[ARG]], i32 0
-; AARCH64-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer
-; AARCH64-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
-; AARCH64-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
+; AARCH64-NEXT:    [[TMP2:%.*]] = or i32 [[ARG]], 0
+; AARCH64-NEXT:    [[TMP3:%.*]] = mul i32 0, 1
 ; AARCH64-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]]
 ; AARCH64-NEXT:    [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]]
+; AARCH64-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0
+; AARCH64-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1
 ; AARCH64-NEXT:    ret <2 x i32> [[TMP1]]
 ;
 bb:
diff --git a/llvm/test/Transforms/SROA/invariant-group.ll b/llvm/test/Transforms/SROA/invariant-group.ll
index 1be6f6e2fc32..c9c9e031ca95 100644
--- a/llvm/test/Transforms/SROA/invariant-group.ll
+++ b/llvm/test/Transforms/SROA/invariant-group.ll
@@ -11,10 +11,17 @@ declare i32 @somevalue()
 
 define void @f() {
 ; CHECK-LABEL: @f(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[T:%.*]], align 8
+; CHECK-NEXT:    [[A1_I8_INV:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[A]])
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds [[T]], ptr [[A]], i32 0, i32 1
 ; CHECK-NEXT:    [[SV1:%.*]] = call i32 @somevalue()
 ; CHECK-NEXT:    [[SV2:%.*]] = call i32 @somevalue()
-; CHECK-NEXT:    call void @h(i32 [[SV1]])
-; CHECK-NEXT:    call void @h(i32 [[SV2]])
+; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 [[SV2]], ptr [[A2]], align 4
+; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[A2]], align 4
+; CHECK-NEXT:    call void @h(i32 [[V1]])
+; CHECK-NEXT:    call void @h(i32 [[V2]])
 ; CHECK-NEXT:    ret void
 ;
   %a = alloca %t
@@ -44,7 +51,7 @@ define void @g() {
 ; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds [[T]], ptr [[A]], i32 0, i32 1
 ; CHECK-NEXT:    [[SV1:%.*]] = call i32 @somevalue()
 ; CHECK-NEXT:    [[SV2:%.*]] = call i32 @somevalue()
-; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0:![0-9]+]]
+; CHECK-NEXT:    store i32 [[SV1]], ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    store i32 [[SV2]], ptr [[A2]], align 4
 ; CHECK-NEXT:    [[V1:%.*]] = load i32, ptr [[A1_I8_INV]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[A2]], align 4
@@ -81,6 +88,9 @@ define void @g() {
 
 define void @store_and_launder() {
 ; CHECK-LABEL: @store_and_launder(
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 0, ptr [[VALPTR]], align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
 ; CHECK-NEXT:    ret void
 ;
   %valptr = alloca i32, align 4
@@ -91,7 +101,10 @@ define void @store_and_launder() {
 
 define i32 @launder_and_load() {
 ; CHECK-LABEL: @launder_and_load(
-; CHECK-NEXT:    ret i32 undef
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
+; CHECK-NEXT:    [[V2:%.*]] = load i32, ptr [[VALPTR]], align 4
+; CHECK-NEXT:    ret i32 [[V2]]
 ;
   %valptr = alloca i32, align 4
   %barr = call ptr @llvm.launder.invariant.group.p0(ptr %valptr)
@@ -101,6 +114,9 @@ define i32 @launder_and_load() {
 
 define void @launder_and_ptr_arith() {
 ; CHECK-LABEL: @launder_and_ptr_arith(
+; CHECK-NEXT:    [[VALPTR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[VALPTR]])
+; CHECK-NEXT:    [[A2:%.*]] = getelementptr inbounds i32, ptr [[VALPTR]], i32 0
 ; CHECK-NEXT:    ret void
 ;
   %valptr = alloca i32, align 4
@@ -140,9 +156,13 @@ end:
 
 define void @partial_promotion_of_alloca() {
 ; CHECK-LABEL: @partial_promotion_of_alloca(
-; CHECK-NEXT:    [[STRUCT_PTR_SROA_2:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store volatile i32 0, ptr [[STRUCT_PTR_SROA_2]], align 4
-; CHECK-NEXT:    [[STRUCT_PTR_SROA_2_0_STRUCT_PTR_SROA_2_4_LOAD_VAL:%.*]] = load volatile i32, ptr [[STRUCT_PTR_SROA_2]], align 4
+; CHECK-NEXT:    [[STRUCT_PTR:%.*]] = alloca [[T:%.*]], align 4
+; CHECK-NEXT:    [[FIELD_PTR:%.*]] = getelementptr inbounds [[T]], ptr [[STRUCT_PTR]], i32 0, i32 0
+; CHECK-NEXT:    store i32 0, ptr [[FIELD_PTR]], align 4
+; CHECK-NEXT:    [[VOLATILE_FIELD_PTR:%.*]] = getelementptr inbounds [[T]], ptr [[STRUCT_PTR]], i32 0, i32 1
+; CHECK-NEXT:    store volatile i32 0, ptr [[VOLATILE_FIELD_PTR]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[BARR:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[STRUCT_PTR]])
+; CHECK-NEXT:    [[LOAD_VAL:%.*]] = load volatile i32, ptr [[VOLATILE_FIELD_PTR]], align 4, !invariant.group [[META0]]
 ; CHECK-NEXT:    ret void
 ;
   %struct_ptr = alloca %t, align 4
@@ -155,6 +175,61 @@ define void @partial_promotion_of_alloca() {
   ret void
 }
 
+define void @memcpy_after_laundering_alloca(ptr %ptr) {
+; CHECK-LABEL: @memcpy_after_laundering_alloca(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca { i64, i64 }, align 8
+; CHECK-NEXT:    [[LAUNDER:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[ALLOCA]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[LAUNDER]], ptr [[PTR:%.*]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca { i64, i64 }, align 8
+  %launder = call ptr @llvm.launder.invariant.group.p0(ptr %alloca)
+  call void @llvm.memcpy.p0.p0.i64(ptr %launder, ptr %ptr, i64 16, i1 false)
+  ret void
+}
+
+define void @memcpy_after_laundering_alloca_slices(ptr %ptr) {
+; CHECK-LABEL: @memcpy_after_laundering_alloca_slices(
+; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca { [16 x i8], i64, [16 x i8] }, align 8
+; CHECK-NEXT:    [[LAUNDER:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[ALLOCA]])
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i8, ptr [[LAUNDER]], i64 16
+; CHECK-NEXT:    store i64 0, ptr [[GEP]], align 4
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[LAUNDER]], ptr [[PTR:%.*]], i64 40, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %alloca = alloca { [16 x i8], i64, [16 x i8] }, align 8
+  %launder = call ptr @llvm.launder.invariant.group.p0(ptr %alloca)
+  %gep = getelementptr i8, ptr %launder, i64 16
+  store i64 0, ptr %gep
+  call void @llvm.memcpy.p0.p0.i64(ptr %launder, ptr %ptr, i64 40, i1 false)
+  ret void
+}
+
+define void @test_agg_store() {
+; CHECK-LABEL: @test_agg_store(
+; CHECK-NEXT:    [[STRUCT_PTR:%.*]] = alloca [[T:%.*]], i64 1, align 4
+; CHECK-NEXT:    [[STRUCT_PTR_FRESH:%.*]] = call ptr @llvm.launder.invariant.group.p0(ptr [[STRUCT_PTR]])
+; CHECK-NEXT:    [[STRUCT:%.*]] = call [[T]] @[[MAKE_T:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]()
+; CHECK-NEXT:    store [[T]] [[STRUCT]], ptr [[STRUCT_PTR_FRESH]], align 4, !invariant.group [[META0]]
+; CHECK-NEXT:    [[FIRST_PTR:%.*]] = getelementptr [[T]], ptr [[STRUCT_PTR_FRESH]], i32 0, i32 0
+; CHECK-NEXT:    [[FIRST:%.*]] = load i32, ptr [[FIRST_PTR]], align 4
+; CHECK-NEXT:    [[SECOND_PTR:%.*]] = getelementptr [[T]], ptr [[STRUCT_PTR_FRESH]], i32 0, i32 1
+; CHECK-NEXT:    [[SECOND:%.*]] = load i32, ptr [[SECOND_PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %struct_ptr = alloca %t, i64 1, align 4
+  %struct_ptr_fresh = call ptr @llvm.launder.invariant.group.p0(ptr %struct_ptr)
+  %struct = call %t @make_t()
+  store %t %struct, ptr %struct_ptr_fresh, align 4, !invariant.group !0
+  %first_ptr = getelementptr %t, ptr %struct_ptr_fresh, i32 0, i32 0
+  %first = load i32, ptr %first_ptr, align 4
+  %second_ptr = getelementptr %t, ptr %struct_ptr_fresh, i32 0, i32 1
+  %second = load i32, ptr %second_ptr, align 4
+  ret void
+}
+
+declare %t @make_t()
+
 declare void @use(ptr)
 
 !0 = !{}
diff --git a/llvm/test/Transforms/ThinLTOBitcodeWriter/cfi-check.ll b/llvm/test/Transforms/ThinLTOBitcodeWriter/cfi-check.ll
new file mode 100644
index 000000000000..b927af6b92f7
--- /dev/null
+++ b/llvm/test/Transforms/ThinLTOBitcodeWriter/cfi-check.ll
@@ -0,0 +1,19 @@
+; RUN: opt -thinlto-bc -thinlto-split-lto-unit -o %t %s
+; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s
+; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s
+
+; Check that __cfi_check is emitted on the full LTO side with
+; attributes preserved.
+
+; M0: define void @f()
+define void @f() !type !{!"f1", i32 0} {
+  ret void 
+}
+
+; M1: define void @__cfi_check() #0
+define void @__cfi_check() #0 {
+  ret void
+}
+
+; M1: attributes #0 = { "branch-target-enforcement" }
+attributes #0 = { "branch-target-enforcement" }
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
index e6e5f5196d3d..5c035d29a7ea 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll
@@ -669,10 +669,10 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) {
 ; Scalarizing the load for multiple constant indices may not be profitable.
 define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx(
-; CHECK-NEXT:    [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP1]], i32 0, i32 1
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <4 x i32>, ptr %x
@@ -686,10 +686,10 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) {
 ; because the vector large vector requires 2 vector registers.
 define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) {
 ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable(
-; CHECK-NEXT:    [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> <i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]]
-; CHECK-NEXT:    [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
+; CHECK-NEXT:    [[E_0:%.*]] = load i32, ptr [[TMP1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i32>, ptr [[TMP1]], i32 0, i32 6
+; CHECK-NEXT:    [[E_1:%.*]] = load i32, ptr [[TMP2]], align 8
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[E_0]], [[E_1]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
   %lv = load <8 x i32>, ptr %x, align 16
diff --git a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
index 5f3229398792..ad5f5a7107c2 100644
--- a/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/intrinsic-scalarize.ll
@@ -13,8 +13,7 @@ define <2 x float> @maxnum(float %x, float %y) {
 ; AVX2-LABEL: define <2 x float> @maxnum(
 ; AVX2-SAME: float [[X:%.*]], float [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
 ; AVX2-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.maxnum.f32(float [[X]], float [[Y]])
-; AVX2-NEXT:    [[TMP1:%.*]] = call <2 x float> @llvm.maxnum.v2f32(<2 x float> poison, <2 x float> poison)
-; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; AVX2-NEXT:    [[V:%.*]] = insertelement <2 x float> poison, float [[V_SCALAR]], i64 0
 ; AVX2-NEXT:    ret <2 x float> [[V]]
 ;
   %x.insert = insertelement <2 x float> poison, float %x, i32 0
diff --git a/llvm/test/Transforms/VectorCombine/binop-scalarize.ll b/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
index 52a706a0b59a..bc07f8b08649 100644
--- a/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/binop-scalarize.ll
@@ -20,3 +20,22 @@ define <4 x i8> @udiv_ub(i8 %x, i8 %y) {
   %v = udiv <4 x i8> %x.insert, %y.insert
   ret <4 x i8> %v
 }
+
+
+; Unfoldable constant expression may cause infinite loop between 
+; scalarizing insertelement and folding binop(insert(x,a,idx),insert(y,b,idx))
+@val = external hidden global ptr, align 8
+
+define <2 x i64> @pr153012(i64 %idx) #0 {
+; CHECK-LABEL: define <2 x i64> @pr153012(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[A:%.*]] = insertelement <2 x i64> <i64 5, i64 ptrtoint (ptr @val to i64)>, i64 [[IDX]], i32 0
+; CHECK-NEXT:    [[B:%.*]] = or disjoint <2 x i64> splat (i64 2), [[A]]
+; CHECK-NEXT:    ret <2 x i64> [[B]]
+;
+entry:
+  %a = insertelement <2 x i64> <i64 5, i64 ptrtoint (ptr @val to i64)>, i64 %idx, i32 0
+  %b = or disjoint <2 x i64> splat (i64 2), %a
+  ret <2 x i64> %b
+}
diff --git a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
index 9e43a28bf1e5..abd98a4dc64b 100644
--- a/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
+++ b/llvm/test/Transforms/VectorCombine/intrinsic-scalarize.ll
@@ -5,8 +5,7 @@ define <4 x i32> @umax_fixed(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -19,8 +18,7 @@ define <vscale x 4 x i32> @umax_scalable(i32 %x, i32 %y) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable(
 ; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 [[Y]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -33,8 +31,7 @@ define <4 x i32> @umax_fixed_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 1, i32 [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -46,8 +43,7 @@ define <4 x i32> @umax_fixed_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <4 x i32> @umax_fixed_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 1)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x i32> [[V]]
 ;
   %x.insert = insertelement <4 x i32> poison, i32 %x, i32 0
@@ -59,8 +55,7 @@ define <vscale x 4 x i32> @umax_scalable_lhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_lhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 42, i32 [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> splat (i32 42), <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -72,8 +67,7 @@ define <vscale x 4 x i32> @umax_scalable_rhs_const(i32 %x) {
 ; CHECK-LABEL: define <vscale x 4 x i32> @umax_scalable_rhs_const(
 ; CHECK-SAME: i32 [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call i32 @llvm.umax.i32(i32 [[X]], i32 42)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.umax.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> splat (i32 42))
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> [[TMP1]], i32 [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x i32> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
@@ -100,8 +94,7 @@ define <4 x float> @fabs_fixed(float %x) {
 ; CHECK-LABEL: define <4 x float> @fabs_fixed(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
@@ -113,8 +106,7 @@ define <vscale x 4 x float> @fabs_scalable(float %x) {
 ; CHECK-LABEL: define <vscale x 4 x float> @fabs_scalable(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
@@ -126,8 +118,7 @@ define <4 x float> @fma_fixed(float %x, float %y, float %z) {
 ; CHECK-LABEL: define <4 x float> @fma_fixed(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> poison, <4 x float> poison, <4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
@@ -141,8 +132,7 @@ define <vscale x 4 x float> @fma_scalable(float %x, float %y, float %z) {
 ; CHECK-LABEL: define <vscale x 4 x float> @fma_scalable(
 ; CHECK-SAME: float [[X:%.*]], float [[Y:%.*]], float [[Z:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.fma.f32(float [[X]], float [[Y]], float [[Z]])
-; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <vscale x 4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <vscale x 4 x float> [[V]]
 ;
   %x.insert = insertelement <vscale x 4 x float> poison, float %x, i32 0
@@ -156,8 +146,7 @@ define <4 x float> @scalar_argument(float %x) {
 ; CHECK-LABEL: define <4 x float> @scalar_argument(
 ; CHECK-SAME: float [[X:%.*]]) {
 ; CHECK-NEXT:    [[V_SCALAR:%.*]] = call float @llvm.powi.f32.i32(float [[X]], i32 42)
-; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.powi.v4f32.i32(<4 x float> poison, i32 42)
-; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> [[TMP1]], float [[V_SCALAR]], i64 0
+; CHECK-NEXT:    [[V:%.*]] = insertelement <4 x float> poison, float [[V_SCALAR]], i64 0
 ; CHECK-NEXT:    ret <4 x float> [[V]]
 ;
   %x.insert = insertelement <4 x float> poison, float %x, i32 0
diff --git a/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test b/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test
new file mode 100644
index 000000000000..12f14b5d58e1
--- /dev/null
+++ b/llvm/test/tools/llvm-objcopy/COFF/exe-bogus-assoc.test
@@ -0,0 +1,134 @@
+## Test that bogus associative section symbols in executables are ignored.
+##
+## The executable contains two (bogus) associative section symbols, both for
+## (parts of) the .rdata section; one pointing at the .debug_info section
+## (which will be stripped out) and one pointing at a nonexistent section.
+##
+## Check that stripping does succeed, and that it doesn't end up removing
+## the .rdata section.
+
+# RUN: yaml2obj %s -o %t.in.exe
+
+# RUN: llvm-strip --strip-debug %t.in.exe -o %t.out.exe
+# RUN: llvm-readobj --sections %t.out.exe | FileCheck %s
+
+# CHECK: Name: .rdata
+
+--- !COFF
+OptionalHeader:
+  AddressOfEntryPoint: 4096
+  ImageBase:       5368709120
+  SectionAlignment: 4096
+  FileAlignment:   512
+  MajorOperatingSystemVersion: 4
+  MinorOperatingSystemVersion: 0
+  MajorImageVersion: 0
+  MinorImageVersion: 0
+  MajorSubsystemVersion: 5
+  MinorSubsystemVersion: 2
+  Subsystem:       IMAGE_SUBSYSTEM_WINDOWS_CUI
+  DLLCharacteristics: [  ]
+  SizeOfStackReserve: 2097152
+  SizeOfStackCommit: 4096
+  SizeOfHeapReserve: 1048576
+  SizeOfHeapCommit: 4096
+header:
+  Machine:         IMAGE_FILE_MACHINE_AMD64
+  Characteristics: [  ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  4096
+    VirtualSize:     48
+    SectionData:     E806000000E802000000C3C3C30F1F00FFFFFFFFFFFFFFFF0000000000000000FFFFFFFFFFFFFFFF0000000000000000
+    SizeOfRawData:   512
+  - Name:            .rdata
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  8192
+    VirtualSize:     4
+    SectionData:     '00000000'
+    SizeOfRawData:   512
+  - Name:            .debug_info
+    Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_DISCARDABLE, IMAGE_SCN_MEM_READ ]
+    VirtualAddress:  16384
+    VirtualSize:     4
+    SectionData:     '00000000'
+    SizeOfRawData:   512
+symbols:
+  - Name:            .text
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          11
+      NumberOfRelocations: 2
+      NumberOfLinenumbers: 0
+      CheckSum:        1703692295
+      Number:          1
+  - Name:            '.text$func1'
+    Value:           11
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        40735498
+      Number:          3
+      Selection:       IMAGE_COMDAT_SELECT_ANY
+  - Name:            .rdata
+    Value:           0
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          3
+      Selection:       IMAGE_COMDAT_SELECT_ASSOCIATIVE
+  - Name:            '.text$func2'
+    Value:           12
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        40735498
+      Number:          4
+      Selection:       IMAGE_COMDAT_SELECT_ANY
+  - Name:            .rdata
+    Value:           1
+    SectionNumber:   2
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          1
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          4
+      Selection:       IMAGE_COMDAT_SELECT_ASSOCIATIVE
+  - Name:            .debug_info
+    Value:           0
+    SectionNumber:   3
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_NULL
+    StorageClass:    IMAGE_SYM_CLASS_STATIC
+    SectionDefinition:
+      Length:          4
+      NumberOfRelocations: 0
+      NumberOfLinenumbers: 0
+      CheckSum:        0
+      Number:          0
+...
diff --git a/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
new file mode 100644
index 000000000000..6a4927e4af2a
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/ELF/Hexagon/hexagon-bundles.s
@@ -0,0 +1,47 @@
+/// Checks that various hexagon scenarios are handled correctly:
+///  - branch targets
+///  - endloops
+///  - inline-relocs
+///  - multi-insn bundles
+
+{
+  r6 = sub(r1, r0)
+  r7 = and(r4, #0x0)
+  if (p1) jump:t target1
+  if (p2) jump:nt target2
+}
+
+{
+  r8 = r7
+  r9 = add(r8, #0)
+  r10 = memw(r9)
+} :endloop0
+
+{ jump ##sym }
+
+target1:
+  nop
+
+target2:
+  nop
+
+// RUN: llvm-mc %s --triple=hexagon -filetype=obj | llvm-objdump -d -r - | FileCheck %s
+
+//      CHECK: 00000000 <.text>:
+// CHECK-NEXT:        0:       12 51 00 5c     5c005112 {      if (p1) jump:t 0x24 <target1>
+// CHECK-NEXT:        4:       14 42 00 5c     5c004214        if (p2) jump:nt 0x28 <target2>
+// CHECK-NEXT:        8:       06 41 20 f3     f3204106        r6 = sub(r1,r0)
+// CHECK-NEXT:        c:       07 c0 04 76     7604c007        r7 = and(r4,#0x0) }
+// CHECK-NEXT:       10:       08 80 67 70     70678008 {      r8 = r7
+// CHECK-NEXT:       14:       09 40 08 b0     b0084009        r9 = add(r8,#0x0)
+// CHECK-NEXT:       18:       0a c0 89 91     9189c00a        r10 = memw(r9+#0x0) }  :endloop0
+// CHECK-NEXT:       1c:       00 40 00 00     00004000 {      immext(#0x0)
+// CHECK-NEXT:                         0000001c:  R_HEX_B32_PCREL_X    sym
+// CHECK-NEXT:       20:       00 c0 00 58     5800c000        jump 0x1c <.text+0x1c> }
+// CHECK-NEXT:                         00000020:  R_HEX_B22_PCREL_X    sym+0x4
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000024 <target1>:
+// CHECK-NEXT:       24:       00 c0 00 7f     7f00c000 {      nop }
+// CHECK-EMPTY:
+// CHECK-NEXT: 00000028 <target2>:
+// CHECK-NEXT:       28:       00 c0 00 7f     7f00c000 {      nop }
diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp
index 607184e3b724..86727931067a 100644
--- a/llvm/tools/llvm-mc/Disassembler.cpp
+++ b/llvm/tools/llvm-mc/Disassembler.cpp
@@ -45,7 +45,11 @@ static bool PrintInsts(const MCDisassembler &DisAsm, const ByteArrayTy &Bytes,
     MCInst Inst;
 
     MCDisassembler::DecodeStatus S;
-    S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
+    if (STI.getTargetTriple().getArch() == Triple::hexagon)
+      S = DisAsm.getInstructionBundle(Inst, Size, Data.slice(Index), Index,
+                                      nulls());
+    else
+      S = DisAsm.getInstruction(Inst, Size, Data.slice(Index), Index, nulls());
     switch (S) {
     case MCDisassembler::Fail:
       SM.PrintMessage(SMLoc::getFromPointer(Bytes.second[Index]),
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp
index c5967cd090ee..74eb9033c8e2 100644
--- a/llvm/tools/llvm-objdump/llvm-objdump.cpp
+++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp
@@ -693,6 +693,30 @@ class PrettyPrinter {
     } else
       OS << "\t<unknown>";
   }
+
+  virtual void emitPostInstructionInfo(formatted_raw_ostream &FOS,
+                                       const MCAsmInfo &MAI,
+                                       const MCSubtargetInfo &STI,
+                                       StringRef Comments,
+                                       LiveVariablePrinter &LVP) {
+    do {
+      if (!Comments.empty()) {
+        // Emit a line of comments.
+        StringRef Comment;
+        std::tie(Comment, Comments) = Comments.split('\n');
+        // MAI.getCommentColumn() assumes that instructions are printed at the
+        // position of 8, while getInstStartColumn() returns the actual
+        // position.
+        unsigned CommentColumn =
+            MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
+        FOS.PadToColumn(CommentColumn);
+        FOS << MAI.getCommentString() << ' ' << Comment;
+      }
+      LVP.printAfterInst(FOS);
+      FOS << "\n";
+    } while (!Comments.empty());
+    FOS.flush();
+  }
 };
 PrettyPrinter PrettyPrinterInst;
 
@@ -714,6 +738,35 @@ class HexagonPrettyPrinter : public PrettyPrinter {
       }
     }
   }
+
+  std::string getInstructionSeparator() const {
+    SmallString<40> Separator;
+    raw_svector_ostream OS(Separator);
+    if (ShouldClosePacket) {
+      OS << " }";
+      if (IsLoop0 || IsLoop1)
+        OS << "  ";
+      if (IsLoop0)
+        OS << (IsLoop1 ? ":endloop01" : ":endloop0");
+      else if (IsLoop1)
+        OS << ":endloop1";
+    }
+    OS << '\n';
+    return OS.str().str();
+  }
+
+  void emitPostInstructionInfo(formatted_raw_ostream &FOS, const MCAsmInfo &MAI,
+                               const MCSubtargetInfo &STI, StringRef Comments,
+                               LiveVariablePrinter &LVP) override {
+    // Hexagon does not write anything to the comment stream, so we can just
+    // print the separator.
+    LVP.printAfterInst(FOS);
+    FOS << getInstructionSeparator();
+    FOS.flush();
+    if (ShouldClosePacket)
+      reset();
+  }
+
   void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
                  object::SectionedAddress Address, formatted_raw_ostream &OS,
                  StringRef Annot, MCSubtargetInfo const &STI, SourcePrinter *SP,
@@ -724,60 +777,64 @@ class HexagonPrettyPrinter : public PrettyPrinter {
     if (!MI) {
       printLead(Bytes, Address.Address, OS);
       OS << " <unknown>";
+      reset();
       return;
     }
-    std::string Buffer;
+
+    StringRef Preamble = IsStartOfBundle ? " { " : "   ";
+
+    if (SP && (PrintSource || PrintLines))
+      SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
+    printLead(Bytes, Address.Address, OS);
+    OS << Preamble;
+    std::string Buf;
     {
-      raw_string_ostream TempStream(Buffer);
+      raw_string_ostream TempStream(Buf);
       IP.printInst(MI, Address.Address, "", STI, TempStream);
     }
-    StringRef Contents(Buffer);
-    // Split off bundle attributes
-    auto PacketBundle = Contents.rsplit('\n');
-    // Split off first instruction from the rest
-    auto HeadTail = PacketBundle.first.split('\n');
-    auto Preamble = " { ";
-    auto Separator = "";
-
-    // Hexagon's packets require relocations to be inline rather than
-    // clustered at the end of the packet.
-    std::vector<RelocationRef>::const_iterator RelCur = Rels->begin();
-    std::vector<RelocationRef>::const_iterator RelEnd = Rels->end();
-    auto PrintReloc = [&]() -> void {
-      while ((RelCur != RelEnd) && (RelCur->getOffset() <= Address.Address)) {
-        if (RelCur->getOffset() == Address.Address) {
-          printRelocation(OS, ObjectFilename, *RelCur, Address.Address, false);
-          return;
-        }
-        ++RelCur;
-      }
-    };
+    StringRef Contents(Buf);
+
+    auto Duplex = Contents.split('\v');
+    bool HasDuplex = !Duplex.second.empty();
+    if (HasDuplex) {
+      OS << Duplex.first;
+      OS << "; ";
+      OS << Duplex.second;
+    } else {
+      OS << Duplex.first;
+    }
 
-    while (!HeadTail.first.empty()) {
-      OS << Separator;
-      Separator = "\n";
-      if (SP && (PrintSource || PrintLines))
-        SP->printSourceLine(OS, Address, ObjectFilename, LVP, "");
-      printLead(Bytes, Address.Address, OS);
-      OS << Preamble;
-      Preamble = "   ";
-      StringRef Inst;
-      auto Duplex = HeadTail.first.split('\v');
-      if (!Duplex.second.empty()) {
-        OS << Duplex.first;
-        OS << "; ";
-        Inst = Duplex.second;
-      }
+    uint32_t Instruction = support::endian::read32le(Bytes.data());
+
+    uint32_t ParseMask = 0x0000c000;
+    uint32_t PacketEndMask = 0x0000c000;
+    uint32_t LoopEndMask = 0x00008000;
+    uint32_t ParseBits = Instruction & ParseMask;
+
+    if (ParseBits == LoopEndMask) {
+      if (IsStartOfBundle)
+        IsLoop0 = true;
       else
-        Inst = HeadTail.first;
-      OS << Inst;
-      HeadTail = HeadTail.second.split('\n');
-      if (HeadTail.first.empty())
-        OS << " } " << PacketBundle.second;
-      PrintReloc();
-      Bytes = Bytes.slice(4);
-      Address.Address += 4;
+        IsLoop1 = true;
     }
+
+    IsStartOfBundle = false;
+
+    if (ParseBits == PacketEndMask || HasDuplex)
+      ShouldClosePacket = true;
+  }
+
+private:
+  bool IsStartOfBundle = true;
+  bool IsLoop0 = false;
+  bool IsLoop1 = false;
+  bool ShouldClosePacket = false;
+
+  void reset() {
+    IsStartOfBundle = true;
+    IsLoop0 = false;
+    IsLoop1 = false;
+    ShouldClosePacket = false;
   }
 };
 HexagonPrettyPrinter HexagonPrettyPrinterInst;
@@ -1610,29 +1667,6 @@ static StringRef getSegmentName(const MachOObjectFile *MachO,
   return "";
 }
 
-static void emitPostInstructionInfo(formatted_raw_ostream &FOS,
-                                    const MCAsmInfo &MAI,
-                                    const MCSubtargetInfo &STI,
-                                    StringRef Comments,
-                                    LiveVariablePrinter &LVP) {
-  do {
-    if (!Comments.empty()) {
-      // Emit a line of comments.
-      StringRef Comment;
-      std::tie(Comment, Comments) = Comments.split('\n');
-      // MAI.getCommentColumn() assumes that instructions are printed at the
-      // position of 8, while getInstStartColumn() returns the actual position.
-      unsigned CommentColumn =
-          MAI.getCommentColumn() - 8 + getInstStartColumn(STI);
-      FOS.PadToColumn(CommentColumn);
-      FOS << MAI.getCommentString() << ' ' << Comment;
-    }
-    LVP.printAfterInst(FOS);
-    FOS << '\n';
-  } while (!Comments.empty());
-  FOS.flush();
-}
-
 static void createFakeELFSections(ObjectFile &Obj) {
   assert(Obj.isELF());
   if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(&Obj))
@@ -2526,15 +2560,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj,
         }
 
         assert(DT->Context->getAsmInfo());
-        emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
-                                *DT->SubtargetInfo, CommentStream.str(), LVP);
+        DT->Printer->emitPostInstructionInfo(FOS, *DT->Context->getAsmInfo(),
+                                             *DT->SubtargetInfo,
+                                             CommentStream.str(), LVP);
         Comments.clear();
 
         if (BTF)
           printBTFRelocation(FOS, *BTF, {Index, Section.getIndex()}, LVP);
 
-        // Hexagon handles relocs in pretty printer
-        if (InlineRelocs && Obj.getArch() != Triple::hexagon) {
+        if (InlineRelocs) {
           while (findRel()) {
             // When --adjust-vma is used, update the address printed.
             printRelocation(FOS, Obj.getFileName(), *RelCur,
diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp
index 7a48105a1dc9..bf396499e35c 100644
--- a/llvm/unittests/Analysis/ValueTrackingTest.cpp
+++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/FloatingPointMode.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/FloatingPointPredicateUtils.h"
 #include "llvm/AsmParser/Parser.h"
@@ -2208,6 +2209,41 @@ TEST_F(ComputeKnownFPClassTest, Constants) {
   }
 }
 
+TEST_F(ComputeKnownFPClassTest, fcmpImpliesClass_fabs_zero) {
+  parseAssembly("define float @test(float %x) {\n"
+                "  %A = call float @llvm.fabs.f32(float %x)\n"
+                "  ret float %A\n"
+                "}\n");
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OEQ, *F, A, fcZero)),
+            fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UEQ, *F, A, fcZero)),
+            fcZero | fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UNE, *F, A, fcZero)),
+            ~fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ONE, *F, A, fcZero)),
+            ~fcNan & ~fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ORD, *F, A, fcZero)),
+            ~fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UNO, *F, A, fcZero)),
+            fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OGT, *F, A, fcZero)),
+            fcSubnormal | fcNormal | fcInf);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UGT, *F, A, fcZero)),
+            fcSubnormal | fcNormal | fcInf | fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OGE, *F, A, fcZero)),
+            ~fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_UGE, *F, A, fcZero)),
+            fcAllFlags);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OLT, *F, A, fcZero)),
+            fcNone);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ULT, *F, A, fcZero)),
+            fcNan);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_OLE, *F, A, fcZero)),
+            fcZero);
+  EXPECT_EQ(std::get<1>(fcmpImpliesClass(FCmpInst::FCMP_ULE, *F, A, fcZero)),
+            fcZero | fcNan);
+}
+
 TEST_F(ValueTrackingTest, isNonZeroRecurrence) {
   parseAssembly(R"(
     define i1 @test(i8 %n, i8 %r) {
diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp
index 36408de7802c..8f733bfaf2ea 100644
--- a/llvm/unittests/TargetParser/TripleTest.cpp
+++ b/llvm/unittests/TargetParser/TripleTest.cpp
@@ -126,6 +126,11 @@ TEST(TripleTest, ParsedIDs) {
   EXPECT_EQ(Triple::Hurd, T.getOS());
   EXPECT_EQ(Triple::GNU, T.getEnvironment());
 
+  T = Triple("aarch64-amazon-linux");
+  EXPECT_EQ(Triple::aarch64, T.getArch());
+  EXPECT_EQ(Triple::Amazon, T.getVendor());
+  EXPECT_EQ(Triple::Linux, T.getOS());
+
   T = Triple("arm-unknown-linux-android16");
   EXPECT_EQ(Triple::arm, T.getArch());
   EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
@@ -1436,6 +1441,11 @@ TEST(TripleTest, Normalization) {
 
   EXPECT_EQ("x86_64-unknown-linux-gnu", Triple::normalize("x86_64-gnu-linux"));
 
+  EXPECT_EQ("aarch64-amazon-linux-gnu",
+            Triple::normalize("aarch64-amazon-linux"));
+  EXPECT_EQ("x86_64-amazon-linux-gnu",
+            Triple::normalize("x86_64-amazon-linux"));
+
   EXPECT_EQ("a-unknown-unknown",
             Triple::normalize("a", Triple::CanonicalForm::THREE_IDENT));
   EXPECT_EQ("a-b-unknown",
diff --git a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
index 8f7beea152ab..30b8bb61184b 100644
--- a/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/tools/libclang/BUILD.gn
@@ -87,6 +87,7 @@ shared_library("libclang") {
     "Index_Internal.h",
     "Indexing.cpp",
     "Rewrite.cpp",
+    "Obsolete.cpp",
   ]
   if (host_os == "mac") {
     ldflags = [
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni
index 2b1a9076afe4..c489f0b49cff 100644
--- a/llvm/utils/gn/secondary/llvm/version.gni
+++ b/llvm/utils/gn/secondary/llvm/version.gni
@@ -1,4 +1,4 @@
 llvm_version_major = 21
-llvm_version_minor = 0
-llvm_version_patch = 0
+llvm_version_minor = 1
+llvm_version_patch = 1
 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch"
diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py
index b5aa8edc03dc..4904e829afcf 100644
--- a/llvm/utils/lit/lit/__init__.py
+++ b/llvm/utils/lit/lit/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Daniel Dunbar"
 __email__ = "daniel@minormatter.com"
-__versioninfo__ = (21, 0, 0)
+__versioninfo__ = (21, 1, 1)
 __version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
 
 __all__ = []
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
index d3369abae70b..b00a2d39ba6b 100644
--- a/llvm/utils/mlgo-utils/mlgo/__init__.py
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -4,7 +4,7 @@
 
 from datetime import timezone, datetime
 
-__versioninfo__ = (20, 0, 0)
+__versioninfo__ = (21, 1, 1)
 __version__ = (
     ".".join(str(v) for v in __versioninfo__)
     + "dev"
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index 3042fc2d77dd..54645d0c6369 100755
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -169,9 +169,9 @@ set common_cmake_flags=^
 
 if "%force-msvc%" == "" (
   where /q clang-cl
-  if errorlevel 0 (
+  if %errorlevel% EQU 0 (
     where /q lld-link
-    if errorlevel 0 (
+    if %errorlevel% EQU 0 (
       set common_compiler_flags=%common_compiler_flags% -fuse-ld=lld
       
       set common_cmake_flags=%common_cmake_flags%^
diff --git a/llvm/utils/release/export.sh b/llvm/utils/release/export.sh
index 66bef82586a3..0ac392cbed7b 100755
--- a/llvm/utils/release/export.sh
+++ b/llvm/utils/release/export.sh
@@ -123,7 +123,7 @@ export_sources() {
                 tar -C test-suite-$release$rc.src --strip-components=1 -xzf -
         fi
         echo "Creating tarball for test-suite ..."
-        tar --sort=name --owner=0 --group=0 \
+        XZ_OPT="-T0" tar --sort=name --owner=0 --group=0 \
             --pax-option=exthdr.name=%d/PaxHeaders/%f,delete=atime,delete=ctime \
             -cJf test-suite-$release$rc.src.tar.xz test-suite-$release$rc.src
     fi
diff --git a/llvm/utils/release/github-upload-release.py b/llvm/utils/release/github-upload-release.py
index 90c222d1175c..d58bb544e17d 100755
--- a/llvm/utils/release/github-upload-release.py
+++ b/llvm/utils/release/github-upload-release.py
@@ -45,19 +45,39 @@ def create_release(repo, release, tag=None, name=None, message=None):
         # Note that these lines are not length limited because if we do so, GitHub
         # assumes that should be how it is laid out on the page. We want GitHub to
         # do the reflowing for us instead.
+        #
+        # Once all the atuomatic binary builds have completed, the HTML comments
+        # with UPPERCASE markers in them will be removed to reveal the download
+        # links later. Other lines are surrounded in <!-- --> for release uploaders
+        # to manually uncomment when they upload that package.
         message = dedent(
             """\
-LLVM {release} Release
+## LLVM {release} Release
 
-## Package Types
+<!-- AUTOMATIC_DOWNLOAD_LINKS_BEGIN
+* [Linux x86_64](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-X64.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-X64.tar.xz.jsonl))
+* [Linux Arm64](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-ARM64.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-Linux-ARM64.tar.xz.jsonl))
+AUTOMATIC_DOWNLOAD_LINKS_END -->
+<!-- * [Linux Armv7-a](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-armv7a-linux-gnueabihf.tar.gz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-armv7a-linux-gnueabihf.tar.gz.sig)) -->
 
-Each platform has one binary release package. The file name starts with either `LLVM-` or `clang+llvm-` and ends with the platform's name. For example, `LLVM-{release}-Linux-ARM64.tar.xz` contains LLVM binaries for Arm64 Linux.
+<!-- AUTOMATIC_DOWNLOAD_LINKS_BEGIN
+* [macOS Apple Silicon](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-ARM64.tar.xz) (ARM64) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-ARM64.tar.xz.jsonl))
+* [macOS Intel](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-X64.tar.xz) (x86-64) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-macOS-X64.tar.xz.jsonl))
+AUTOMATIC_DOWNLOAD_LINKS_END -->
+
+<!-- * Windows x64 (64-bit): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win64.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win64.exe.sig)), [archive](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-x86_64-pc-windows-msvc.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-x86_64-pc-windows-msvc.tar.xz.sig)) -->
+<!-- * Windows x86 (32-bit): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win32.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-win32.exe.sig)) -->
+<!-- * Windows on Arm (ARM64): [installer](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-woa64.exe) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/LLVM-{release}-woa64.exe.sig)), [archive](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-aarch64-pc-windows-msvc.tar.xz) ([signature](https://github.com/llvm/llvm-project/releases/download/llvmorg-{release}/clang+llvm-{release}-aarch64-pc-windows-msvc.tar.xz.sig)) -->
 
-Except for Windows. Where `LLVM-*.exe` is an installer intended for using LLVM as a toolchain and `clang+llvm-` contains the contents of the installer, plus libraries and tools not normally used in a toolchain. You most likely want the `LLVM-` installer, unless you are developing software which itself uses LLVM, in which case choose `clang+llvm-`.
+Download links will appear here once builds have completed. <!-- AUTOMATIC_DOWNLOAD_LINKS_PLACEHOLDER -->
+
+For any other variants of platform and architecture, check the full list of release packages at the bottom of this release page. If you do not find a release package for your platform, you may be able to find a community built package on the LLVM Discourse forum thread for this release. Remember that these are built by volunteers and may not always be available. If you rely on a platform or configuration that is not one of the defaults, we suggest you use the binaries that your platform provides, or build your own release packages.
+
+## Package Types
 
-If you do not find a release package for your platform, you may be able to find a community built package on the LLVM Discourse forum thread for this release. Remember that these are built by volunteers and may not always be available.
+Each platform has one binary release package. The file name starts with either `LLVM-` or `clang+llvm-` and ends with the platform's name. For example, `LLVM-{release}-Linux-ARM64.tar.xz` contains LLVM binaries for Arm64 Linux.
 
-If you rely on a platform or configuration that is not one of the defaults, we suggest you use the binaries that your platform provides, or build your own release packages.
+Except for Windows. Where `LLVM-*.exe` is an installer intended for using LLVM as a toolchain and the archive `clang+llvm-` contains the contents of the installer, plus libraries and tools not normally used in a toolchain. You most likely want the `LLVM-` installer, unless you are developing software which itself uses LLVM, in which case choose `clang+llvm-`.
 
 In addition, source archives are available:
 * `<sub-project>-{release}.src.tar.xz` are archives of the sources of specific sub-projects of `llvm-project` (except for `test-suite` which is an archive of the [LLVM Test Suite](https://github.com/llvm/llvm-test-suite)).
@@ -95,9 +115,35 @@ def upload_files(repo, release, files):
         print("Done")
 
 
+def uncomment_download_links(repo, release):
+    release = repo.get_release("llvmorg-{}".format(release))
+
+    new_message = []
+    to_remove = [
+        "AUTOMATIC_DOWNLOAD_LINKS_BEGIN",
+        "AUTOMATIC_DOWNLOAD_LINKS_END",
+        "AUTOMATIC_DOWNLOAD_LINKS_PLACEHOLDER",
+    ]
+    for line in release.body.splitlines():
+        for comment in to_remove:
+            if comment in line:
+                break
+        else:
+            new_message.append(line)
+
+    release.update_release(
+        name=release.title,
+        message="\n".join(new_message),
+        draft=release.draft,
+        prerelease=release.prerelease,
+    )
+
+
 parser = argparse.ArgumentParser()
 parser.add_argument(
-    "command", type=str, choices=["create", "upload", "check-permissions"]
+    "command",
+    type=str,
+    choices=["create", "upload", "check-permissions", "uncomment_download_links"],
 )
 
 # All args
@@ -137,3 +183,5 @@ def upload_files(repo, release, files):
     create_release(llvm_repo, args.release)
 if args.command == "upload":
     upload_files(llvm_repo, args.release, args.files)
+if args.command == "uncomment_download_links":
+    uncomment_download_links(llvm_repo, args.release)
diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake
index ff4269ed7acd..38839679ef8b 100644
--- a/mlir/cmake/modules/AddMLIR.cmake
+++ b/mlir/cmake/modules/AddMLIR.cmake
@@ -388,6 +388,9 @@ function(add_mlir_library name)
 
   if(TARGET ${name})
     target_link_libraries(${name} INTERFACE ${LLVM_COMMON_LIBS})
+    if(ARG_INSTALL_WITH_TOOLCHAIN)
+      set_target_properties(${name} PROPERTIES MLIR_INSTALL_WITH_TOOLCHAIN TRUE)
+    endif()
     if(NOT ARG_DISABLE_INSTALL)
       add_mlir_library_install(${name})
     endif()
@@ -617,26 +620,27 @@ endfunction(add_mlir_aggregate)
 # This is usually done as part of add_mlir_library but is broken out for cases
 # where non-standard library builds can be installed.
 function(add_mlir_library_install name)
-  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
-  get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries)
-  install(TARGETS ${name}
-    COMPONENT ${name}
-    ${export_to_mlirtargets}
-    LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-    ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-    RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
-    # Note that CMake will create a directory like:
-    #   objects-${CMAKE_BUILD_TYPE}/obj.LibName
-    # and put object files there.
-    OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX}
-  )
+  get_target_property(_install_with_toolchain ${name} MLIR_INSTALL_WITH_TOOLCHAIN)
+  if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY OR _install_with_toolchain)
+    get_target_export_arg(${name} MLIR export_to_mlirtargets UMBRELLA mlir-libraries)
+    install(TARGETS ${name}
+      COMPONENT ${name}
+      ${export_to_mlirtargets}
+      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
+      # Note that CMake will create a directory like:
+      #   objects-${CMAKE_BUILD_TYPE}/obj.LibName
+      # and put object files there.
+      OBJECTS DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+    )
 
-  if (NOT LLVM_ENABLE_IDE)
-    add_llvm_install_targets(install-${name}
-                            DEPENDS ${name}
-                            COMPONENT ${name})
-  endif()
-  set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name})
+    if (NOT LLVM_ENABLE_IDE)
+      add_llvm_install_targets(install-${name}
+                              DEPENDS ${name}
+                              COMPONENT ${name})
+    endif()
+    set_property(GLOBAL APPEND PROPERTY MLIR_ALL_LIBS ${name})
   endif()
   set_property(GLOBAL APPEND PROPERTY MLIR_EXPORTS ${name})
 endfunction()
diff --git a/mlir/include/mlir/Analysis/DataFlowFramework.h b/mlir/include/mlir/Analysis/DataFlowFramework.h
index 49862927caff..e364570c8b53 100644
--- a/mlir/include/mlir/Analysis/DataFlowFramework.h
+++ b/mlir/include/mlir/Analysis/DataFlowFramework.h
@@ -354,29 +354,7 @@ class DataFlowSolver {
 
   /// Erase any analysis state associated with the given lattice anchor.
   template <typename AnchorT>
-  void eraseState(AnchorT anchor) {
-    LatticeAnchor latticeAnchor(anchor);
-
-    // Update equivalentAnchorMap.
-    for (auto &&[TypeId, eqClass] : equivalentAnchorMap) {
-      if (!eqClass.contains(latticeAnchor)) {
-        continue;
-      }
-      llvm::EquivalenceClasses<LatticeAnchor>::member_iterator leaderIt =
-          eqClass.findLeader(latticeAnchor);
-
-      // Update analysis states with new leader if needed.
-      if (*leaderIt == latticeAnchor && ++leaderIt != eqClass.member_end()) {
-        analysisStates[*leaderIt][TypeId] =
-            std::move(analysisStates[latticeAnchor][TypeId]);
-      }
-
-      eqClass.erase(latticeAnchor);
-    }
-
-    // Update analysis states.
-    analysisStates.erase(latticeAnchor);
-  }
+  void eraseState(AnchorT anchor);
 
   /// Erase all analysis states.
   void eraseAllStates() {
@@ -560,6 +538,36 @@ class AnalysisState {
   friend class DataFlowSolver;
 };
 
+//===----------------------------------------------------------------------===//
+// DataFlowSolver definition
+//===----------------------------------------------------------------------===//
+// This method is defined outside `DataFlowSolver` and after `AnalysisState`
+// to prevent issues around `AnalysisState` being used before it is defined.
+template <typename AnchorT>
+void DataFlowSolver::eraseState(AnchorT anchor) {
+  LatticeAnchor latticeAnchor(anchor);
+
+  // Update equivalentAnchorMap.
+  for (auto &&[TypeId, eqClass] : equivalentAnchorMap) {
+    if (!eqClass.contains(latticeAnchor)) {
+      continue;
+    }
+    llvm::EquivalenceClasses<LatticeAnchor>::member_iterator leaderIt =
+        eqClass.findLeader(latticeAnchor);
+
+    // Update analysis states with new leader if needed.
+    if (*leaderIt == latticeAnchor && ++leaderIt != eqClass.member_end()) {
+      analysisStates[*leaderIt][TypeId] =
+          std::move(analysisStates[latticeAnchor][TypeId]);
+    }
+
+    eqClass.erase(latticeAnchor);
+  }
+
+  // Update analysis states.
+  analysisStates.erase(latticeAnchor);
+}
+
 //===----------------------------------------------------------------------===//
 // DataFlowAnalysis
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h
index d082d2d9f758..18349d071bb2 100644
--- a/mlir/include/mlir/Analysis/SliceAnalysis.h
+++ b/mlir/include/mlir/Analysis/SliceAnalysis.h
@@ -65,8 +65,9 @@ using ForwardSliceOptions = SliceOptions;
 ///
 /// The implementation traverses the use chains in postorder traversal for
 /// efficiency reasons: if an operation is already in `forwardSlice`, no
-/// need to traverse its uses again. Since use-def chains form a DAG, this
-/// terminates.
+/// need to traverse its uses again. In the presence of use-def cycles in a
+/// graph region, the traversal stops at the first operation that was already
+/// visited (which is not added to the slice anymore).
 ///
 /// Upon return to the root call, `forwardSlice` is filled with a
 /// postorder list of uses (i.e. a reverse topological order). To get a proper
@@ -114,8 +115,9 @@ void getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
 ///
 /// The implementation traverses the def chains in postorder traversal for
 /// efficiency reasons: if an operation is already in `backwardSlice`, no
-/// need to traverse its definitions again. Since useuse-def chains form a DAG,
-/// this terminates.
+/// need to traverse its definitions again. In the presence of use-def cycles
+/// in a graph region, the traversal stops at the first operation that was
+/// already visited (which is not added to the slice anymore).
 ///
 /// Upon return to the root call, `backwardSlice` is filled with a
 /// postorder list of defs. This happens to be a topological order, from the
diff --git a/mlir/include/mlir/AsmParser/AsmParser.h b/mlir/include/mlir/AsmParser/AsmParser.h
index 33daf7ca26f4..f39b3bd853a2 100644
--- a/mlir/include/mlir/AsmParser/AsmParser.h
+++ b/mlir/include/mlir/AsmParser/AsmParser.h
@@ -53,7 +53,8 @@ parseAsmSourceFile(const llvm::SourceMgr &sourceMgr, Block *block,
 /// null terminated.
 Attribute parseAttribute(llvm::StringRef attrStr, MLIRContext *context,
                          Type type = {}, size_t *numRead = nullptr,
-                         bool isKnownNullTerminated = false);
+                         bool isKnownNullTerminated = false,
+                         llvm::StringMap<Attribute> *attributesCache = nullptr);
 
 /// This parses a single MLIR type to an MLIR context if it was valid. If not,
 /// an error diagnostic is emitted to the context.
diff --git a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
index 3f1041cb2510..243dbf081b99 100644
--- a/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
+++ b/mlir/include/mlir/Dialect/Mesh/Transforms/Simplifications.h
@@ -62,9 +62,11 @@ void populateAllReduceEndomorphismSimplificationPatterns(
   auto isEndomorphismOp = [reduction](Operation *op,
                                       std::optional<Operation *> referenceOp) {
     auto allReduceOp = llvm::dyn_cast<AllReduceOp>(op);
+    if (!allReduceOp)
+      return false;
     auto inType = cast<ShapedType>(allReduceOp.getInput().getType());
     auto outType = cast<ShapedType>(allReduceOp.getResult().getType());
-    if (!allReduceOp || inType.getElementType() != outType.getElementType() ||
+    if (inType.getElementType() != outType.getElementType() ||
         allReduceOp.getReduction() != reduction) {
       return false;
     }
@@ -87,9 +89,7 @@ void populateAllReduceEndomorphismSimplificationPatterns(
     return refAllReduceOp->getAttrs() == allReduceOp->getAttrs() &&
            inType.getElementType() == refType.getElementType();
   };
-  auto isAlgebraicOp = [](Operation *op) {
-    return static_cast<bool>(llvm::dyn_cast<AlgebraicOp>(op));
-  };
+  auto isAlgebraicOp = [](Operation *op) { return isa<AlgebraicOp>(op); };
 
   using ConcreteEndomorphismSimplification = EndomorphismSimplification<
       std::decay_t<decltype(getEndomorphismOpOperand)>,
diff --git a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
index 6e72f7c23bdc..d66d757cb7a8 100644
--- a/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/MemRefUtils.h
@@ -151,7 +151,7 @@ class OwningMemRef {
       AllocFunType allocFun = &::malloc,
       std::function<void(StridedMemRefType<T, Rank>)> freeFun =
           [](StridedMemRefType<T, Rank> descriptor) {
-            ::free(descriptor.data);
+            ::free(descriptor.basePtr);
           })
       : freeFunc(freeFun) {
     if (shapeAlloc.empty())
diff --git a/mlir/include/mlir/IR/Operation.h b/mlir/include/mlir/IR/Operation.h
index 1c2c04e718bf..11af3b7d4d7b 100644
--- a/mlir/include/mlir/IR/Operation.h
+++ b/mlir/include/mlir/IR/Operation.h
@@ -1102,6 +1102,49 @@ inline raw_ostream &operator<<(raw_ostream &os, const Operation &op) {
   return os;
 }
 
+/// A wrapper class that allows for printing an operation with a set of flags,
+/// useful to act as a "stream modifier" to customize printing an operation
+/// with a stream using the operator<< overload, e.g.:
+///   llvm::dbgs() << OpWithFlags(op, OpPrintingFlags().skipRegions());
+class OpWithFlags {
+public:
+  OpWithFlags(Operation *op, OpPrintingFlags flags = {})
+      : op(op), theFlags(flags) {}
+  OpPrintingFlags &flags() { return theFlags; }
+  const OpPrintingFlags &flags() const { return theFlags; }
+
+private:
+  Operation *op;
+  OpPrintingFlags theFlags;
+  friend raw_ostream &operator<<(raw_ostream &os, const OpWithFlags &op);
+};
+
+inline raw_ostream &operator<<(raw_ostream &os,
+                               const OpWithFlags &opWithFlags) {
+  opWithFlags.op->print(os, opWithFlags.flags());
+  return os;
+}
+
+/// A wrapper class that allows for printing an operation with a custom
+/// AsmState, useful to act as a "stream modifier" to customize printing an
+/// operation with a stream using the operator<< overload, e.g.:
+///   llvm::dbgs() << OpWithState(op, OpPrintingFlags().skipRegions());
+class OpWithState {
+public:
+  OpWithState(Operation *op, AsmState &state) : op(op), theState(state) {}
+
+private:
+  Operation *op;
+  AsmState &theState;
+  friend raw_ostream &operator<<(raw_ostream &os, const OpWithState &op);
+};
+
+inline raw_ostream &operator<<(raw_ostream &os,
+                               const OpWithState &opWithState) {
+  opWithState.op->print(os, const_cast<OpWithState &>(opWithState).theState);
+  return os;
+}
+
 } // namespace mlir
 
 namespace llvm {
diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h
index afeb784b85a1..9565eaf4da76 100644
--- a/mlir/include/mlir/IR/PatternMatch.h
+++ b/mlir/include/mlir/IR/PatternMatch.h
@@ -311,14 +311,14 @@ struct OpOrInterfaceRewritePatternBase : public RewritePattern {
 /// opposed to a raw Operation.
 template <typename SourceOp>
 struct OpRewritePattern
-    : public detail::OpOrInterfaceRewritePatternBase<SourceOp> {
+    : public mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp> {
 
   /// Patterns must specify the root operation name they match against, and can
   /// also specify the benefit of the pattern matching and a list of generated
   /// ops.
   OpRewritePattern(MLIRContext *context, PatternBenefit benefit = 1,
                    ArrayRef<StringRef> generatedNames = {})
-      : detail::OpOrInterfaceRewritePatternBase<SourceOp>(
+      : mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp>(
             SourceOp::getOperationName(), benefit, context, generatedNames) {}
 };
 
@@ -327,10 +327,10 @@ struct OpRewritePattern
 /// of a raw Operation.
 template <typename SourceOp>
 struct OpInterfaceRewritePattern
-    : public detail::OpOrInterfaceRewritePatternBase<SourceOp> {
+    : public mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp> {
 
   OpInterfaceRewritePattern(MLIRContext *context, PatternBenefit benefit = 1)
-      : detail::OpOrInterfaceRewritePatternBase<SourceOp>(
+      : mlir::detail::OpOrInterfaceRewritePatternBase<SourceOp>(
             Pattern::MatchInterfaceOpTypeTag(), SourceOp::getInterfaceID(),
             benefit, context) {}
 };
diff --git a/mlir/include/mlir/Pass/PassOptions.h b/mlir/include/mlir/Pass/PassOptions.h
index e1f16c6158ad..0c71f78b52d3 100644
--- a/mlir/include/mlir/Pass/PassOptions.h
+++ b/mlir/include/mlir/Pass/PassOptions.h
@@ -377,7 +377,7 @@ class PassOptions : protected llvm::cl::SubCommand {
 ///   ListOption<int> someListFlag{*this, "flag-name", llvm::cl::desc("...")};
 /// };
 template <typename T>
-class PassPipelineOptions : public detail::PassOptions {
+class PassPipelineOptions : public virtual detail::PassOptions {
 public:
   /// Factory that parses the provided options and returns a unique_ptr to the
   /// struct.
diff --git a/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
index 6d62ae3dd43d..7d5c1d5cebb2 100644
--- a/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
+++ b/mlir/include/mlir/Transforms/WalkPatternRewriteDriver.h
@@ -27,6 +27,8 @@ namespace mlir {
 /// This is intended as the simplest and most lightweight pattern rewriter in
 /// cases when a simple walk gets the job done.
 ///
+/// The driver will skip unreachable blocks.
+///
 /// Note: Does not apply patterns to the given operation itself.
 void walkAndApplyPatterns(Operation *op,
                           const FrozenRewritePatternSet &patterns,
diff --git a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
index 6a12fe3acc2c..e1d7498f7be3 100644
--- a/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/LivenessAnalysis.cpp
@@ -295,9 +295,45 @@ RunLivenessAnalysis::RunLivenessAnalysis(Operation *op) {
 
   loadBaselineAnalyses(solver);
   solver.load<LivenessAnalysis>(symbolTable);
-  LDBG("Initializing and running solver");
+  LLVM_DEBUG({ llvm::dbgs() << "Initializing and running solver\n"; });
   (void)solver.initializeAndRun(op);
-  LDBG("Dumping liveness state for op");
+  LLVM_DEBUG({
+    llvm::dbgs() << "RunLivenessAnalysis initialized for op: " << op->getName()
+                 << " check on unreachable code now:"
+                 << "\n";
+  });
+  // The framework doesn't visit operations in dead blocks, so we need to
+  // explicitly mark them as dead.
+  op->walk([&](Operation *op) {
+    if (op->getNumResults() == 0)
+      return;
+    for (auto result : llvm::enumerate(op->getResults())) {
+      if (getLiveness(result.value()))
+        continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "Result: " << result.index() << " of "
+                     << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                     << " has no liveness info (unreachable), mark dead"
+                     << "\n";
+      });
+      solver.getOrCreateState<Liveness>(result.value());
+    }
+    for (auto &region : op->getRegions()) {
+      for (auto &block : region) {
+        for (auto blockArg : llvm::enumerate(block.getArguments())) {
+          if (getLiveness(blockArg.value()))
+            continue;
+          LLVM_DEBUG({
+            llvm::dbgs() << "Block argument: " << blockArg.index() << " of "
+                         << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                         << " has no liveness info, mark dead"
+                         << "\n";
+          });
+          solver.getOrCreateState<Liveness>(blockArg.value());
+        }
+      }
+    }
+  });
 }
 
 const Liveness *RunLivenessAnalysis::getLiveness(Value val) {
diff --git a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
index e625f626d12f..5e342fd87773 100644
--- a/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
+++ b/mlir/lib/Analysis/DataFlow/SparseAnalysis.cpp
@@ -19,12 +19,15 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
 #include <cassert>
 #include <optional>
 
 using namespace mlir;
 using namespace mlir::dataflow;
 
+#define DEBUG_TYPE "dataflow"
+
 //===----------------------------------------------------------------------===//
 // AbstractSparseLattice
 //===----------------------------------------------------------------------===//
@@ -64,22 +67,56 @@ AbstractSparseForwardDataFlowAnalysis::initialize(Operation *top) {
 
 LogicalResult
 AbstractSparseForwardDataFlowAnalysis::initializeRecursively(Operation *op) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Initializing recursively for operation: " << op->getName()
+                 << "\n";
+  });
+
   // Initialize the analysis by visiting every owner of an SSA value (all
   // operations and blocks).
-  if (failed(visitOperation(op)))
+  if (failed(visitOperation(op))) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Failed to visit operation: " << op->getName() << "\n";
+    });
     return failure();
+  }
 
   for (Region &region : op->getRegions()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing region with " << region.getBlocks().size()
+                   << " blocks"
+                   << "\n";
+    });
     for (Block &block : region) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Processing block with " << block.getNumArguments()
+                     << " arguments"
+                     << "\n";
+      });
       getOrCreate<Executable>(getProgramPointBefore(&block))
           ->blockContentSubscribe(this);
       visitBlock(&block);
-      for (Operation &op : block)
-        if (failed(initializeRecursively(&op)))
+      for (Operation &op : block) {
+        LLVM_DEBUG({
+          llvm::dbgs() << "Recursively initializing nested operation: "
+                       << op.getName() << "\n";
+        });
+        if (failed(initializeRecursively(&op))) {
+          LLVM_DEBUG({
+            llvm::dbgs() << "Failed to initialize nested operation: "
+                         << op.getName() << "\n";
+          });
           return failure();
+        }
+      }
     }
   }
 
+  LLVM_DEBUG({
+    llvm::dbgs()
+        << "Successfully completed recursive initialization for operation: "
+        << op->getName() << "\n";
+  });
   return success();
 }
 
@@ -409,11 +446,29 @@ static MutableArrayRef<OpOperand> operandsToOpOperands(OperandRange &operands) {
 
 LogicalResult
 AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
+  LLVM_DEBUG({
+    llvm::dbgs() << "Visiting operation: " << op->getName() << " with "
+                 << op->getNumOperands() << " operands and "
+                 << op->getNumResults() << " results"
+                 << "\n";
+  });
+
   // If we're in a dead block, bail out.
   if (op->getBlock() != nullptr &&
-      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))->isLive())
+      !getOrCreate<Executable>(getProgramPointBefore(op->getBlock()))
+           ->isLive()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Operation is in dead block, bailing out"
+                   << "\n";
+    });
     return success();
+  }
 
+  LLVM_DEBUG({
+    llvm::dbgs() << "Creating lattice elements for " << op->getNumOperands()
+                 << " operands and " << op->getNumResults() << " results"
+                 << "\n";
+  });
   SmallVector<AbstractSparseLattice *> operandLattices =
       getLatticeElements(op->getOperands());
   SmallVector<const AbstractSparseLattice *> resultLattices =
@@ -422,11 +477,21 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // Block arguments of region branch operations flow back into the operands
   // of the parent op
   if (auto branch = dyn_cast<RegionBranchOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing RegionBranchOpInterface operation"
+                   << "\n";
+    });
     visitRegionSuccessors(branch, operandLattices);
     return success();
   }
 
   if (auto branch = dyn_cast<BranchOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing BranchOpInterface operation with "
+                   << op->getNumSuccessors() << " successors"
+                   << "\n";
+    });
+
     // Block arguments of successor blocks flow back into our operands.
 
     // We remember all operands not forwarded to any block in a BitVector.
@@ -463,6 +528,10 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // For function calls, connect the arguments of the entry blocks to the
   // operands of the call op that are forwarded to these arguments.
   if (auto call = dyn_cast<CallOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing CallOpInterface operation"
+                   << "\n";
+    });
     Operation *callableOp = call.resolveCallableInTable(&symbolTable);
     if (auto callable = dyn_cast_or_null<CallableOpInterface>(callableOp)) {
       // Not all operands of a call op forward to arguments. Such operands are
@@ -513,6 +582,10 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   // of this op itself and the operands of the terminators of the regions of
   // this op.
   if (auto terminator = dyn_cast<RegionBranchTerminatorOpInterface>(op)) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing RegionBranchTerminatorOpInterface operation"
+                   << "\n";
+    });
     if (auto branch = dyn_cast<RegionBranchOpInterface>(op->getParentOp())) {
       visitRegionSuccessorsFromTerminator(terminator, branch);
       return success();
@@ -520,12 +593,25 @@ AbstractSparseBackwardDataFlowAnalysis::visitOperation(Operation *op) {
   }
 
   if (op->hasTrait<OpTrait::ReturnLike>()) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Processing ReturnLike operation"
+                   << "\n";
+    });
     // Going backwards, the operands of the return are derived from the
     // results of all CallOps calling this CallableOp.
-    if (auto callable = dyn_cast<CallableOpInterface>(op->getParentOp()))
+    if (auto callable = dyn_cast<CallableOpInterface>(op->getParentOp())) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Callable parent found, visiting callable operation"
+                     << "\n";
+      });
       return visitCallableOperation(op, callable, operandLattices);
+    }
   }
 
+  LLVM_DEBUG({
+    llvm::dbgs() << "Using default visitOperationImpl for operation: "
+                 << op->getName() << "\n";
+  });
   return visitOperationImpl(op, operandLattices, resultLattices);
 }
 
diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp
index 36a9812bd797..991c71e3f689 100644
--- a/mlir/lib/Analysis/SliceAnalysis.cpp
+++ b/mlir/lib/Analysis/SliceAnalysis.cpp
@@ -26,7 +26,8 @@
 using namespace mlir;
 
 static void
-getForwardSliceImpl(Operation *op, SetVector<Operation *> *forwardSlice,
+getForwardSliceImpl(Operation *op, DenseSet<Operation *> &visited,
+                    SetVector<Operation *> *forwardSlice,
                     const SliceOptions::TransitiveFilter &filter = nullptr) {
   if (!op)
     return;
@@ -40,20 +41,41 @@ getForwardSliceImpl(Operation *op, SetVector<Operation *> *forwardSlice,
   for (Region &region : op->getRegions())
     for (Block &block : region)
       for (Operation &blockOp : block)
-        if (forwardSlice->count(&blockOp) == 0)
-          getForwardSliceImpl(&blockOp, forwardSlice, filter);
-  for (Value result : op->getResults()) {
-    for (Operation *userOp : result.getUsers())
-      if (forwardSlice->count(userOp) == 0)
-        getForwardSliceImpl(userOp, forwardSlice, filter);
-  }
+        if (forwardSlice->count(&blockOp) == 0) {
+          // We don't have to check if the 'blockOp' is already visited because
+          // there cannot be a traversal path from this nested op to the parent
+          // and thus a cycle cannot be closed here. We still have to mark it
+          // as visited to stop before visiting this operation again if it is
+          // part of a cycle.
+          visited.insert(&blockOp);
+          getForwardSliceImpl(&blockOp, visited, forwardSlice, filter);
+          visited.erase(&blockOp);
+        }
+
+  for (Value result : op->getResults())
+    for (Operation *userOp : result.getUsers()) {
+      // A cycle can only occur within a basic block (not across regions or
+      // basic blocks) because the parent region must be a graph region, graph
+      // regions are restricted to always have 0 or 1 blocks, and there cannot
+      // be a def-use edge from a nested operation to an operation in an
+      // ancestor region. Therefore, we don't have to but may use the same
+      // 'visited' set across regions/blocks as long as we remove operations
+      // from the set again when the DFS traverses back from the leaf to the
+      // root.
+      if (forwardSlice->count(userOp) == 0 && visited.insert(userOp).second)
+        getForwardSliceImpl(userOp, visited, forwardSlice, filter);
+
+      visited.erase(userOp);
+    }
 
   forwardSlice->insert(op);
 }
 
 void mlir::getForwardSlice(Operation *op, SetVector<Operation *> *forwardSlice,
                            const ForwardSliceOptions &options) {
-  getForwardSliceImpl(op, forwardSlice, options.filter);
+  DenseSet<Operation *> visited;
+  visited.insert(op);
+  getForwardSliceImpl(op, visited, forwardSlice, options.filter);
   if (!options.inclusive) {
     // Don't insert the top level operation, we just queried on it and don't
     // want it in the results.
@@ -69,8 +91,12 @@ void mlir::getForwardSlice(Operation *op, SetVector<Operation *> *forwardSlice,
 
 void mlir::getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
                            const SliceOptions &options) {
-  for (Operation *user : root.getUsers())
-    getForwardSliceImpl(user, forwardSlice, options.filter);
+  DenseSet<Operation *> visited;
+  for (Operation *user : root.getUsers()) {
+    visited.insert(user);
+    getForwardSliceImpl(user, visited, forwardSlice, options.filter);
+    visited.erase(user);
+  }
 
   // Reverse to get back the actual topological order.
   // std::reverse does not work out of the box on SetVector and I want an
@@ -80,6 +106,7 @@ void mlir::getForwardSlice(Value root, SetVector<Operation *> *forwardSlice,
 }
 
 static LogicalResult getBackwardSliceImpl(Operation *op,
+                                          DenseSet<Operation *> &visited,
                                           SetVector<Operation *> *backwardSlice,
                                           const BackwardSliceOptions &options) {
   if (!op || op->hasTrait<OpTrait::IsIsolatedFromAbove>())
@@ -93,8 +120,12 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
 
   auto processValue = [&](Value value) {
     if (auto *definingOp = value.getDefiningOp()) {
-      if (backwardSlice->count(definingOp) == 0)
-        return getBackwardSliceImpl(definingOp, backwardSlice, options);
+      if (backwardSlice->count(definingOp) == 0 &&
+          visited.insert(definingOp).second)
+        return getBackwardSliceImpl(definingOp, visited, backwardSlice,
+                                    options);
+
+      visited.erase(definingOp);
     } else if (auto blockArg = dyn_cast<BlockArgument>(value)) {
       if (options.omitBlockArguments)
         return success();
@@ -107,7 +138,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
       if (parentOp && backwardSlice->count(parentOp) == 0) {
         if (parentOp->getNumRegions() == 1 &&
             llvm::hasSingleElement(parentOp->getRegion(0).getBlocks())) {
-          return getBackwardSliceImpl(parentOp, backwardSlice, options);
+          return getBackwardSliceImpl(parentOp, visited, backwardSlice,
+                                      options);
         }
       }
     } else {
@@ -145,7 +177,10 @@ static LogicalResult getBackwardSliceImpl(Operation *op,
 LogicalResult mlir::getBackwardSlice(Operation *op,
                                      SetVector<Operation *> *backwardSlice,
                                      const BackwardSliceOptions &options) {
-  LogicalResult result = getBackwardSliceImpl(op, backwardSlice, options);
+  DenseSet<Operation *> visited;
+  visited.insert(op);
+  LogicalResult result =
+      getBackwardSliceImpl(op, visited, backwardSlice, options);
 
   if (!options.inclusive) {
     // Don't insert the top level operation, we just queried on it and don't
diff --git a/mlir/lib/AsmParser/DialectSymbolParser.cpp b/mlir/lib/AsmParser/DialectSymbolParser.cpp
index 9f4a87a6a02d..416d8eb5f40e 100644
--- a/mlir/lib/AsmParser/DialectSymbolParser.cpp
+++ b/mlir/lib/AsmParser/DialectSymbolParser.cpp
@@ -89,6 +89,7 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
     nestedPunctuation.pop_back();
     return success();
   };
+  const char *curBufferEnd = state.lex.getBufferEnd();
   do {
     // Handle code completions, which may appear in the middle of the symbol
     // body.
@@ -98,6 +99,12 @@ ParseResult Parser::parseDialectSymbolBody(StringRef &body,
       break;
     }
 
+    if (curBufferEnd == curPtr) {
+      if (!nestedPunctuation.empty())
+        return emitPunctError();
+      return emitError("unexpected nul or EOF in pretty dialect name");
+    }
+
     char c = *curPtr++;
     switch (c) {
     case '\0':
@@ -238,6 +245,15 @@ static Symbol parseExtendedSymbol(Parser &p, AsmParserState *asmState,
       return nullptr;
   }
 
+  if constexpr (std::is_same_v<Symbol, Attribute>) {
+    auto &cache = p.getState().symbols.attributesCache;
+    auto cacheIt = cache.find(symbolData);
+    // Skip cached attribute if it has type.
+    if (cacheIt != cache.end() && !p.getToken().is(Token::colon))
+      return cacheIt->second;
+
+    return cache[symbolData] = createSymbol(dialectName, symbolData, loc);
+  }
   return createSymbol(dialectName, symbolData, loc);
 }
 
@@ -330,6 +346,7 @@ Type Parser::parseExtendedType() {
 template <typename T, typename ParserFn>
 static T parseSymbol(StringRef inputStr, MLIRContext *context,
                      size_t *numReadOut, bool isKnownNullTerminated,
+                     llvm::StringMap<Attribute> *attributesCache,
                      ParserFn &&parserFn) {
   // Set the buffer name to the string being parsed, so that it appears in error
   // diagnostics.
@@ -341,6 +358,9 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
   SourceMgr sourceMgr;
   sourceMgr.AddNewSourceBuffer(std::move(memBuffer), SMLoc());
   SymbolState aliasState;
+  if (attributesCache)
+    aliasState.attributesCache = *attributesCache;
+
   ParserConfig config(context);
   ParserState state(sourceMgr, config, aliasState, /*asmState=*/nullptr,
                     /*codeCompleteContext=*/nullptr);
@@ -351,6 +371,11 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
   if (!symbol)
     return T();
 
+  if constexpr (std::is_same_v<T, Attribute>) {
+    if (attributesCache)
+      *attributesCache = state.symbols.attributesCache;
+  }
+
   // Provide the number of bytes that were read.
   Token endTok = parser.getToken();
   size_t numRead =
@@ -367,13 +392,15 @@ static T parseSymbol(StringRef inputStr, MLIRContext *context,
 
 Attribute mlir::parseAttribute(StringRef attrStr, MLIRContext *context,
                                Type type, size_t *numRead,
-                               bool isKnownNullTerminated) {
+                               bool isKnownNullTerminated,
+                               llvm::StringMap<Attribute> *attributesCache) {
   return parseSymbol<Attribute>(
-      attrStr, context, numRead, isKnownNullTerminated,
+      attrStr, context, numRead, isKnownNullTerminated, attributesCache,
       [type](Parser &parser) { return parser.parseAttribute(type); });
 }
 Type mlir::parseType(StringRef typeStr, MLIRContext *context, size_t *numRead,
                      bool isKnownNullTerminated) {
   return parseSymbol<Type>(typeStr, context, numRead, isKnownNullTerminated,
+                           /*attributesCache=*/nullptr,
                            [](Parser &parser) { return parser.parseType(); });
 }
diff --git a/mlir/lib/AsmParser/Lexer.cpp b/mlir/lib/AsmParser/Lexer.cpp
index 751bd63e537f..8f53529823e2 100644
--- a/mlir/lib/AsmParser/Lexer.cpp
+++ b/mlir/lib/AsmParser/Lexer.cpp
@@ -37,6 +37,18 @@ Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context,
              AsmParserCodeCompleteContext *codeCompleteContext)
     : sourceMgr(sourceMgr), context(context), codeCompleteLoc(nullptr) {
   auto bufferID = sourceMgr.getMainFileID();
+
+  // Check to see if the main buffer contains the last buffer, and if so the
+  // last buffer should be used as main file for parsing.
+  if (sourceMgr.getNumBuffers() > 1) {
+    unsigned lastFileID = sourceMgr.getNumBuffers();
+    const llvm::MemoryBuffer *main = sourceMgr.getMemoryBuffer(bufferID);
+    const llvm::MemoryBuffer *last = sourceMgr.getMemoryBuffer(lastFileID);
+    if (main->getBufferStart() <= last->getBufferStart() &&
+        main->getBufferEnd() >= last->getBufferEnd()) {
+      bufferID = lastFileID;
+    }
+  }
   curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
   curPtr = curBuffer.begin();
 
@@ -71,6 +83,7 @@ Token Lexer::emitError(const char *loc, const Twine &message) {
 }
 
 Token Lexer::lexToken() {
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     const char *tokStart = curPtr;
 
@@ -78,6 +91,9 @@ Token Lexer::lexToken() {
     if (tokStart == codeCompleteLoc)
       return formToken(Token::code_complete, tokStart);
 
+    if (tokStart == curBufferEnd)
+      return formToken(Token::eof, tokStart);
+
     // Lex the next token.
     switch (*curPtr++) {
     default:
@@ -102,7 +118,7 @@ Token Lexer::lexToken() {
     case 0:
       // This may either be a nul character in the source file or may be the EOF
       // marker that llvm::MemoryBuffer guarantees will be there.
-      if (curPtr - 1 == curBuffer.end())
+      if (curPtr - 1 == curBufferEnd)
         return formToken(Token::eof, tokStart);
       continue;
 
@@ -259,7 +275,11 @@ void Lexer::skipComment() {
   assert(*curPtr == '/');
   ++curPtr;
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
+    if (curPtr == curBufferEnd)
+      return;
+
     switch (*curPtr++) {
     case '\n':
     case '\r':
@@ -267,7 +287,7 @@ void Lexer::skipComment() {
       return;
     case 0:
       // If this is the end of the buffer, end the comment.
-      if (curPtr - 1 == curBuffer.end()) {
+      if (curPtr - 1 == curBufferEnd) {
         --curPtr;
         return;
       }
@@ -405,6 +425,7 @@ Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
 Token Lexer::lexString(const char *tokStart) {
   assert(curPtr[-1] == '"');
 
+  const char *curBufferEnd = curBuffer.end();
   while (true) {
     // Check to see if there is a code completion location within the string. In
     // these cases we generate a completion location and place the currently
@@ -419,7 +440,7 @@ Token Lexer::lexString(const char *tokStart) {
     case 0:
       // If this is a random nul character in the middle of a string, just
       // include it.  If it is the end of file, then it is an error.
-      if (curPtr - 1 != curBuffer.end())
+      if (curPtr - 1 != curBufferEnd)
         continue;
       [[fallthrough]];
     case '\n':
diff --git a/mlir/lib/AsmParser/Lexer.h b/mlir/lib/AsmParser/Lexer.h
index 4085a9b73854..670444eb1f5b 100644
--- a/mlir/lib/AsmParser/Lexer.h
+++ b/mlir/lib/AsmParser/Lexer.h
@@ -40,6 +40,9 @@ class Lexer {
   /// Returns the start of the buffer.
   const char *getBufferBegin() { return curBuffer.data(); }
 
+  /// Returns the end of the buffer.
+  const char *getBufferEnd() { return curBuffer.end(); }
+
   /// Return the code completion location of the lexer, or nullptr if there is
   /// none.
   const char *getCodeCompleteLoc() const { return codeCompleteLoc; }
diff --git a/mlir/lib/AsmParser/ParserState.h b/mlir/lib/AsmParser/ParserState.h
index 159058a18fa4..aa53032107cb 100644
--- a/mlir/lib/AsmParser/ParserState.h
+++ b/mlir/lib/AsmParser/ParserState.h
@@ -40,6 +40,9 @@ struct SymbolState {
 
   /// A map from unique integer identifier to DistinctAttr.
   DenseMap<uint64_t, DistinctAttr> distinctAttributes;
+
+  /// A map from unique string identifier to Attribute.
+  llvm::StringMap<Attribute> attributesCache;
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index 44458d010c6c..0f9744343377 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -895,6 +895,10 @@ class AttrTypeReader {
   SmallVector<AttrEntry> attributes;
   SmallVector<TypeEntry> types;
 
+  /// The map of cached attributes, used to avoid re-parsing the same
+  /// attribute multiple times.
+  llvm::StringMap<Attribute> attributesCache;
+
   /// A location used for error emission.
   Location fileLoc;
 
@@ -1235,7 +1239,7 @@ LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
         ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
   else
     result = ::parseAttribute(asmStr, context, Type(), &numRead,
-                              /*isKnownNullTerminated=*/true);
+                              /*isKnownNullTerminated=*/true, &attributesCache);
   if (!result)
     return failure();
 
diff --git a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
index 0df91a243d07..523dc463a0da 100644
--- a/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
+++ b/mlir/lib/Conversion/SCFToControlFlow/SCFToControlFlow.cpp
@@ -582,6 +582,7 @@ LogicalResult WhileLowering::matchAndRewrite(WhileOp whileOp,
   // block. This should be reconsidered if we allow break/continue in SCF.
   rewriter.setInsertionPointToEnd(before);
   auto condOp = cast<ConditionOp>(before->getTerminator());
+  SmallVector<Value> args = llvm::to_vector(condOp.getArgs());
   rewriter.replaceOpWithNewOp<cf::CondBranchOp>(condOp, condOp.getCondition(),
                                                 after, condOp.getArgs(),
                                                 continuation, ValueRange());
@@ -593,7 +594,7 @@ LogicalResult WhileLowering::matchAndRewrite(WhileOp whileOp,
 
   // Replace the op with values "yielded" from the "before" region, which are
   // visible by dominance.
-  rewriter.replaceOp(whileOp, condOp.getArgs());
+  rewriter.replaceOp(whileOp, args);
 
   return success();
 }
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 00c31a1500e1..dbea050b554e 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -4273,14 +4273,15 @@ LogicalResult scf::IndexSwitchOp::verify() {
              << "see yield operation here";
     }
     for (auto [idx, result, operand] :
-         llvm::zip(llvm::seq<unsigned>(0, getNumResults()), getResultTypes(),
-                   yield.getOperandTypes())) {
-      if (result == operand)
+         llvm::enumerate(getResultTypes(), yield.getOperands())) {
+      if (!operand)
+        return yield.emitOpError() << "operand " << idx << " is null\n";
+      if (result == operand.getType())
         continue;
       return (emitOpError("expected result #")
               << idx << " of each region to be " << result)
                  .attachNote(yield.getLoc())
-             << name << " returns " << operand << " here";
+             << name << " returns " << operand.getType() << " here";
     }
     return success();
   };
diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
index 1cded38c4419..36059e553d30 100644
--- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
+++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp
@@ -272,7 +272,7 @@ std::optional<int64_t> constantTripCount(OpFoldResult lb, OpFoldResult ub,
   if (!ubConstant)
     return std::nullopt;
   std::optional<int64_t> stepConstant = getConstantIntValue(step);
-  if (!stepConstant)
+  if (!stepConstant || *stepConstant == 0)
     return std::nullopt;
 
   return llvm::divideCeilSigned(*ubConstant - *lbConstant, *stepConstant);
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 8a08a157b25d..7d615bfc1298 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4237,28 +4237,35 @@ class StridedSliceBroadcast final
     auto dstVecType = llvm::cast<VectorType>(op.getType());
     unsigned dstRank = dstVecType.getRank();
     unsigned rankDiff = dstRank - srcRank;
-    // Check if the most inner dimensions of the source of the broadcast are the
-    // same as the destination of the extract. If this is the case we can just
-    // use a broadcast as the original dimensions are untouched.
-    bool lowerDimMatch = true;
+    // Source dimensions can be broadcasted (1 -> n with n > 1) or sliced
+    // (n -> m with n > m). If they are originally both broadcasted *and*
+    // sliced, this can be simplified to just broadcasting.
+    bool needsSlice = false;
     for (unsigned i = 0; i < srcRank; i++) {
-      if (srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) {
-        lowerDimMatch = false;
+      if (srcVecType.getDimSize(i) != 1 &&
+          srcVecType.getDimSize(i) != dstVecType.getDimSize(i + rankDiff)) {
+        needsSlice = true;
         break;
       }
     }
     Value source = broadcast.getSource();
-    // If the inner dimensions don't match, it means we need to extract from the
-    // source of the orignal broadcast and then broadcast the extracted value.
-    // We also need to handle degenerated cases where the source is effectively
-    // just a single scalar.
-    bool isScalarSrc = (srcRank == 0 || srcVecType.getNumElements() == 1);
-    if (!lowerDimMatch && !isScalarSrc) {
+    if (needsSlice) {
+      SmallVector<int64_t> offsets =
+          getI64SubArray(op.getOffsets(), /*dropFront=*/rankDiff);
+      SmallVector<int64_t> sizes =
+          getI64SubArray(op.getSizes(), /*dropFront=*/rankDiff);
+      for (unsigned i = 0; i < srcRank; i++) {
+        if (srcVecType.getDimSize(i) == 1) {
+          // In case this dimension was broadcasted *and* sliced, the offset
+          // and size need to be updated now that there is no broadcast before
+          // the slice.
+          offsets[i] = 0;
+          sizes[i] = 1;
+        }
+      }
       source = rewriter.create<ExtractStridedSliceOp>(
-          op->getLoc(), source,
-          getI64SubArray(op.getOffsets(), /* dropFront=*/rankDiff),
-          getI64SubArray(op.getSizes(), /* dropFront=*/rankDiff),
-          getI64SubArray(op.getStrides(), /* dropFront=*/rankDiff));
+          op->getLoc(), source, offsets, sizes,
+          getI64SubArray(op.getStrides(), /*dropFront=*/rankDiff));
     }
     rewriter.replaceOpWithNewOp<BroadcastOp>(op, op.getType(), source);
     return success();
diff --git a/mlir/lib/IR/AttributeDetail.h b/mlir/lib/IR/AttributeDetail.h
index 26d40ac3a38f..cb9d21bf3e61 100644
--- a/mlir/lib/IR/AttributeDetail.h
+++ b/mlir/lib/IR/AttributeDetail.h
@@ -19,11 +19,9 @@
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/IR/MLIRContext.h"
-#include "mlir/Support/StorageUniquer.h"
-#include "mlir/Support/ThreadLocalCache.h"
 #include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/Allocator.h"
+#include <mutex>
 
 namespace mlir {
 namespace detail {
@@ -396,27 +394,30 @@ class DistinctAttributeUniquer {
                                               Attribute referencedAttr);
 };
 
-/// An allocator for distinct attribute storage instances. It uses thread local
-/// bump pointer allocators stored in a thread local cache to ensure the storage
-/// is freed after the destruction of the distinct attribute allocator.
-class DistinctAttributeAllocator {
+/// An allocator for distinct attribute storage instances. Uses a synchronized
+/// BumpPtrAllocator to ensure thread-safety. The allocated storage is deleted
+/// when the DistinctAttributeAllocator is destroyed.
+class DistinctAttributeAllocator final {
 public:
   DistinctAttributeAllocator() = default;
-
   DistinctAttributeAllocator(DistinctAttributeAllocator &&) = delete;
   DistinctAttributeAllocator(const DistinctAttributeAllocator &) = delete;
   DistinctAttributeAllocator &
   operator=(const DistinctAttributeAllocator &) = delete;
 
-  /// Allocates a distinct attribute storage using a thread local bump pointer
-  /// allocator to enable synchronization free parallel allocations.
   DistinctAttrStorage *allocate(Attribute referencedAttr) {
-    return new (allocatorCache.get().Allocate<DistinctAttrStorage>())
+    std::scoped_lock<std::mutex> guard(allocatorMutex);
+    return new (allocator.Allocate<DistinctAttrStorage>())
         DistinctAttrStorage(referencedAttr);
-  }
+  };
 
 private:
-  ThreadLocalCache<llvm::BumpPtrAllocator> allocatorCache;
+  /// Used to allocate distict attribute storages. The managed memory is freed
+  /// automatically when the allocator instance is destroyed.
+  llvm::BumpPtrAllocator allocator;
+
+  /// Used to lock access to the allocator.
+  std::mutex allocatorMutex;
 };
 } // namespace detail
 } // namespace mlir
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index 716d9c85a377..2d5381d43f86 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -31,6 +31,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/RWMutex.h"
 #include "llvm/Support/ThreadPool.h"
@@ -883,6 +884,8 @@ int OperationName::UnregisteredOpModel::getOpPropertyByteSize() {
 void OperationName::UnregisteredOpModel::initProperties(
     OperationName opName, OpaqueProperties storage, OpaqueProperties init) {
   new (storage.as<Attribute *>()) Attribute();
+  if (init)
+    *storage.as<Attribute *>() = *init.as<Attribute *>();
 }
 void OperationName::UnregisteredOpModel::deleteProperties(
     OpaqueProperties prop) {
diff --git a/mlir/lib/Pass/PassCrashRecovery.cpp b/mlir/lib/Pass/PassCrashRecovery.cpp
index b048ff946239..c6fb1d737d50 100644
--- a/mlir/lib/Pass/PassCrashRecovery.cpp
+++ b/mlir/lib/Pass/PassCrashRecovery.cpp
@@ -414,14 +414,19 @@ struct FileReproducerStream : public mlir::ReproducerStream {
 
 LogicalResult PassManager::runWithCrashRecovery(Operation *op,
                                                 AnalysisManager am) {
+  const bool threadingEnabled = getContext()->isMultithreadingEnabled();
   crashReproGenerator->initialize(getPasses(), op, verifyPasses);
 
   // Safely invoke the passes within a recovery context.
   LogicalResult passManagerResult = failure();
   llvm::CrashRecoveryContext recoveryContext;
-  recoveryContext.RunSafelyOnThread(
-      [&] { passManagerResult = runPasses(op, am); });
+  const auto runPassesFn = [&] { passManagerResult = runPasses(op, am); };
+  if (threadingEnabled)
+    recoveryContext.RunSafelyOnThread(runPassesFn);
+  else
+    recoveryContext.RunSafely(runPassesFn);
   crashReproGenerator->finalize(op, passManagerResult);
+
   return passManagerResult;
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3185f28fe668..f8ea6ee07447 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3395,6 +3395,7 @@ static llvm::omp::Directive convertCancellationConstructType(
   case omp::ClauseCancellationConstructType::Taskgroup:
     return llvm::omp::Directive::OMPD_taskgroup;
   }
+  llvm_unreachable("Unhandled cancellation construct type");
 }
 
 static LogicalResult
diff --git a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
index 31e0caa76811..9f2a5c761b5c 100644
--- a/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
+++ b/mlir/lib/Tools/mlir-opt/MlirOptMain.cpp
@@ -508,8 +508,7 @@ performActions(raw_ostream &os,
            << "bytecode version while not emitting bytecode";
   AsmState asmState(op.get(), OpPrintingFlags(), /*locationMap=*/nullptr,
                     &fallbackResourceMap);
-  op.get()->print(os, asmState);
-  os << '\n';
+  os << OpWithState(op.get(), asmState) << '\n';
   return success();
 }
 
diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 608bdcb94817..1d7e2135e23e 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -258,16 +258,22 @@ static SmallVector<OpOperand *> operandsToOpOperands(OperandRange operands) {
 static void processSimpleOp(Operation *op, RunLivenessAnalysis &la,
                             DenseSet<Value> &nonLiveSet,
                             RDVFinalCleanupList &cl) {
-  LDBG("Processing simple op: " << *op);
   if (!isMemoryEffectFree(op) || hasLive(op->getResults(), nonLiveSet, la)) {
-    LDBG("Simple op is not memory effect free or has live results, skipping: "
-         << *op);
+    LLVM_DEBUG({
+      llvm::dbgs()
+          << "Simple op is not memory effect free or has live results, "
+             "preserving it: "
+          << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+    });
     return;
   }
 
-  LDBG("Simple op has all dead results and is memory effect free, scheduling "
-       "for removal: "
-       << *op);
+  LLVM_DEBUG({
+    llvm::dbgs() << "Simple op has all dead results and is memory effect free, "
+                    "scheduling "
+                    "for removal: "
+                 << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+  });
   cl.operations.push_back(op);
   collectNonLiveValues(nonLiveSet, op->getResults(),
                        BitVector(op->getNumResults(), true));
@@ -345,8 +351,6 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module,
   // since it forwards only to non-live value(s) (%1#1).
   Operation *lastReturnOp = funcOp.back().getTerminator();
   size_t numReturns = lastReturnOp->getNumOperands();
-  if (numReturns == 0)
-    return;
   BitVector nonLiveRets(numReturns, true);
   for (SymbolTable::SymbolUse use : uses) {
     Operation *callOp = use.getUser();
@@ -368,6 +372,8 @@ static void processFuncOp(FunctionOpInterface funcOp, Operation *module,
   cl.functions.push_back({funcOp, nonLiveArgs, nonLiveRets});
 
   // Do (5) and (6).
+  if (numReturns == 0)
+    return;
   for (SymbolTable::SymbolUse use : uses) {
     Operation *callOp = use.getUser();
     assert(isa<CallOpInterface>(callOp) && "expected a call-like user");
@@ -727,19 +733,53 @@ static void processBranchOp(BranchOpInterface branchOp, RunLivenessAnalysis &la,
 /// Removes dead values collected in RDVFinalCleanupList.
 /// To be run once when all dead values have been collected.
 static void cleanUpDeadVals(RDVFinalCleanupList &list) {
+  LLVM_DEBUG({ llvm::dbgs() << "Starting cleanup of dead values...\n"; });
+
   // 1. Operations
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.operations.size() << " operations"
+                 << "\n";
+  });
   for (auto &op : list.operations) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing operation: "
+                   << OpWithFlags(op, OpPrintingFlags().skipRegions()) << "\n";
+    });
     op->dropAllUses();
     op->erase();
   }
 
   // 2. Values
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.values.size() << " values"
+                 << "\n";
+  });
   for (auto &v : list.values) {
+    LLVM_DEBUG(
+        { llvm::dbgs() << "Dropping all uses of value: " << v << "\n"; });
     v.dropAllUses();
   }
 
   // 3. Functions
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.functions.size() << " functions"
+                 << "\n";
+  });
   for (auto &f : list.functions) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Cleaning up function: "
+                   << f.funcOp.getOperation()->getName() << "\n";
+    });
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Erasing " << f.nonLiveArgs.count()
+                   << " non-live arguments"
+                   << "\n";
+    });
+    LLVM_DEBUG({
+      llvm::dbgs() << "  Erasing " << f.nonLiveRets.count()
+                   << " non-live return values"
+                   << "\n";
+    });
     // Some functions may not allow erasing arguments or results. These calls
     // return failure in such cases without modifying the function, so it's okay
     // to proceed.
@@ -748,44 +788,99 @@ static void cleanUpDeadVals(RDVFinalCleanupList &list) {
   }
 
   // 4. Operands
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.operands.size() << " operand lists"
+                 << "\n";
+  });
   for (OperationToCleanup &o : list.operands) {
-    if (o.op->getNumOperands() > 0)
+    if (o.op->getNumOperands() > 0) {
+      LLVM_DEBUG({
+        llvm::dbgs() << "Erasing " << o.nonLive.count()
+                     << " non-live operands from operation: "
+                     << OpWithFlags(o.op, OpPrintingFlags().skipRegions())
+                     << "\n";
+      });
       o.op->eraseOperands(o.nonLive);
+    }
   }
 
   // 5. Results
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.results.size() << " result lists"
+                 << "\n";
+  });
   for (auto &r : list.results) {
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << r.nonLive.count()
+                   << " non-live results from operation: "
+                   << OpWithFlags(r.op, OpPrintingFlags().skipRegions())
+                   << "\n";
+    });
     dropUsesAndEraseResults(r.op, r.nonLive);
   }
 
   // 6. Blocks
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.blocks.size()
+                 << " block argument lists"
+                 << "\n";
+  });
   for (auto &b : list.blocks) {
     // blocks that are accessed via multiple codepaths processed once
     if (b.b->getNumArguments() != b.nonLiveArgs.size())
       continue;
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << b.nonLiveArgs.count()
+                   << " non-live arguments from block: " << b.b << "\n";
+    });
     // it iterates backwards because erase invalidates all successor indexes
     for (int i = b.nonLiveArgs.size() - 1; i >= 0; --i) {
       if (!b.nonLiveArgs[i])
         continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "  Erasing block argument " << i << ": "
+                     << b.b->getArgument(i) << "\n";
+      });
       b.b->getArgument(i).dropAllUses();
       b.b->eraseArgument(i);
     }
   }
 
   // 7. Successor Operands
+  LLVM_DEBUG({
+    llvm::dbgs() << "Cleaning up " << list.successorOperands.size()
+                 << " successor operand lists"
+                 << "\n";
+  });
   for (auto &op : list.successorOperands) {
     SuccessorOperands successorOperands =
         op.branch.getSuccessorOperands(op.successorIndex);
     // blocks that are accessed via multiple codepaths processed once
     if (successorOperands.size() != op.nonLiveOperands.size())
       continue;
+    LLVM_DEBUG({
+      llvm::dbgs() << "Erasing " << op.nonLiveOperands.count()
+                   << " non-live successor operands from successor "
+                   << op.successorIndex << " of branch: "
+                   << OpWithFlags(op.branch, OpPrintingFlags().skipRegions())
+                   << "\n";
+    });
     // it iterates backwards because erase invalidates all successor indexes
     for (int i = successorOperands.size() - 1; i >= 0; --i) {
       if (!op.nonLiveOperands[i])
         continue;
+      LLVM_DEBUG({
+        llvm::dbgs() << "  Erasing successor operand " << i << ": "
+                     << successorOperands[i] << "\n";
+      });
       successorOperands.erase(i);
     }
   }
+
+  LLVM_DEBUG({
+    llvm::dbgs() << "Finished cleanup of dead values"
+                 << "\n";
+  });
 }
 
 struct RemoveDeadValues : public impl::RemoveDeadValuesBase<RemoveDeadValues> {
diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
index b82d85041394..0a2a0cc1d5c7 100644
--- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp
@@ -22,6 +22,7 @@
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/raw_ostream.h"
@@ -870,7 +871,18 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && {
 
     ctx->executeAction<GreedyPatternRewriteIteration>(
         [&] {
-          continueRewrites = processWorklist();
+          continueRewrites = false;
+
+          // Erase unreachable blocks
+          // Operations like:
+          //   %add = arith.addi %add, %add : i64
+          // are legal in unreachable code. Unfortunately many patterns would be
+          // unsafe to apply on such IR and can lead to crashes or infinite
+          // loops.
+          continueRewrites |=
+              succeeded(eraseUnreachableBlocks(rewriter, region));
+
+          continueRewrites |= processWorklist();
 
           // After applying patterns, make sure that the CFG of each of the
           // regions is kept up to date.
diff --git a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
index ee5c642c943c..baa76b9aab4e 100644
--- a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
+++ b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp
@@ -13,11 +13,13 @@
 #include "mlir/Transforms/WalkPatternRewriteDriver.h"
 
 #include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/Operation.h"
 #include "mlir/IR/OperationSupport.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/Verifier.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Rewrite/PatternApplicator.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 
@@ -25,6 +27,26 @@
 
 namespace mlir {
 
+// Find all reachable blocks in the region and add them to the visitedBlocks
+// set.
+static void findReachableBlocks(Region &region,
+                                DenseSet<Block *> &reachableBlocks) {
+  Block *entryBlock = &region.front();
+  reachableBlocks.insert(entryBlock);
+  // Traverse the CFG and add all reachable blocks to the blockList.
+  SmallVector<Block *> worklist({entryBlock});
+  while (!worklist.empty()) {
+    Block *block = worklist.pop_back_val();
+    Operation *terminator = &block->back();
+    for (Block *successor : terminator->getSuccessors()) {
+      if (reachableBlocks.contains(successor))
+        continue;
+      worklist.push_back(successor);
+      reachableBlocks.insert(successor);
+    }
+  }
+}
+
 namespace {
 struct WalkAndApplyPatternsAction final
     : tracing::ActionImpl<WalkAndApplyPatternsAction> {
@@ -88,20 +110,112 @@ void walkAndApplyPatterns(Operation *op,
   PatternApplicator applicator(patterns);
   applicator.applyDefaultCostModel();
 
+  // Iterator on all reachable operations in the region.
+  // Also keep track if we visited the nested regions of the current op
+  // already to drive the post-order traversal.
+  struct RegionReachableOpIterator {
+    RegionReachableOpIterator(Region *region) : region(region) {
+      regionIt = region->begin();
+      if (regionIt != region->end())
+        blockIt = regionIt->begin();
+      if (!llvm::hasSingleElement(*region))
+        findReachableBlocks(*region, reachableBlocks);
+    }
+    // Advance the iterator to the next reachable operation.
+    void advance() {
+      assert(regionIt != region->end());
+      hasVisitedRegions = false;
+      if (blockIt == regionIt->end()) {
+        ++regionIt;
+        while (regionIt != region->end() &&
+               !reachableBlocks.contains(&*regionIt))
+          ++regionIt;
+        if (regionIt != region->end())
+          blockIt = regionIt->begin();
+        return;
+      }
+      ++blockIt;
+      if (blockIt != regionIt->end()) {
+        LLVM_DEBUG({
+          llvm::dbgs() << "Incrementing block iterator, next op: "
+                       << OpWithFlags(&*blockIt,
+                                      OpPrintingFlags().skipRegions())
+                       << "\n";
+        });
+      }
+    }
+    // The region we're iterating over.
+    Region *region;
+    // The Block currently being iterated over.
+    Region::iterator regionIt;
+    // The Operation currently being iterated over.
+    Block::iterator blockIt;
+    // The set of blocks that are reachable in the current region.
+    DenseSet<Block *> reachableBlocks;
+    // Whether we've visited the nested regions of the current op already.
+    bool hasVisitedRegions = false;
+  };
+
+  // Worklist of regions to visit to drive the post-order traversal.
+  SmallVector<RegionReachableOpIterator> worklist;
+
+  LLVM_DEBUG(
+      { llvm::dbgs() << "Starting walk-based pattern rewrite driver\n"; });
   ctx->executeAction<WalkAndApplyPatternsAction>(
       [&] {
+        // Perform a post-order traversal of the regions, visiting each
+        // reachable operation.
         for (Region &region : op->getRegions()) {
-          region.walk([&](Operation *visitedOp) {
-            LLVM_DEBUG(llvm::dbgs() << "Visiting op: "; visitedOp->print(
-                llvm::dbgs(), OpPrintingFlags().skipRegions());
-                       llvm::dbgs() << "\n";);
+          assert(worklist.empty());
+          if (region.empty())
+            continue;
+
+          // Prime the worklist with the entry block of this region.
+          worklist.push_back({&region});
+          while (!worklist.empty()) {
+            RegionReachableOpIterator &it = worklist.back();
+            if (it.regionIt == it.region->end()) {
+              // We're done with this region.
+              worklist.pop_back();
+              continue;
+            }
+            if (it.blockIt == it.regionIt->end()) {
+              // We're done with this block.
+              it.advance();
+              continue;
+            }
+            Operation *op = &*it.blockIt;
+            // If we haven't visited the nested regions of this op yet,
+            // enqueue them.
+            if (!it.hasVisitedRegions) {
+              it.hasVisitedRegions = true;
+              for (Region &nestedRegion : llvm::reverse(op->getRegions())) {
+                if (nestedRegion.empty())
+                  continue;
+                worklist.push_back({&nestedRegion});
+              }
+            }
+            // If we're not at the back of the worklist, we've enqueued some
+            // nested region for processing. We'll come back to this op later
+            // (post-order)
+            if (&it != &worklist.back())
+              continue;
+
+            // Preemptively increment the iterator, in case the current op
+            // would be erased.
+            it.advance();
+
+            LLVM_DEBUG({
+              llvm::dbgs() << "Visiting op: "
+                           << OpWithFlags(op, OpPrintingFlags().skipRegions())
+                           << "\n";
+            });
 #if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
-            erasedListener.visitedOp = visitedOp;
+            erasedListener.visitedOp = op;
 #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS
-            if (succeeded(applicator.matchAndRewrite(visitedOp, rewriter))) {
-              LLVM_DEBUG(llvm::dbgs() << "\tOp matched and rewritten\n";);
-            }
-          });
+            if (succeeded(applicator.matchAndRewrite(op, rewriter)))
+              LLVM_DEBUG({ llvm::dbgs() << "\tOp matched and rewritten\n"; });
+          }
         }
       },
       {op});
diff --git a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
index a89a0f4084e9..3748be74eb0f 100644
--- a/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
+++ b/mlir/test/Analysis/DataFlow/test-liveness-analysis.mlir
@@ -283,3 +283,23 @@ func.func @test_10_negative() -> (i32) {
   %0:2 = func.call @private_1() : () -> (i32, i32)
   return %0#0 : i32
 }
+
+// -----
+
+// Test that we correctly set a liveness value for operations in dead block.
+// These won't be visited by the dataflow framework so the analysis need to
+// explicitly manage them.
+// CHECK-LABEL: test_tag: dead_block_cmpi:
+// CHECK-NEXT: operand #0: not live
+// CHECK-NEXT: operand #1: not live
+// CHECK-NEXT: result #0: not live
+func.func @dead_block() {
+  %false = arith.constant false
+  %zero = arith.constant 0 : i64
+  cf.cond_br %false, ^bb1, ^bb4
+  ^bb1:
+    %3 = arith.cmpi eq, %zero, %zero  {tag = "dead_block_cmpi"} : i64
+    cf.br ^bb1
+  ^bb4:
+    return
+}
diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir
index 0848a924b9d9..c53667a98cfb 100644
--- a/mlir/test/Dialect/Affine/slicing-utils.mlir
+++ b/mlir/test/Dialect/Affine/slicing-utils.mlir
@@ -292,3 +292,26 @@ func.func @slicing_test_multiple_return(%arg0: index) -> (index, index) {
   %0:2 = "slicing-test-op"(%arg0, %arg0): (index, index) -> (index, index)
   return %0#0, %0#1 : index, index
 }
+
+// -----
+
+// FWD-LABEL: graph_region_with_cycle
+// BWD-LABEL: graph_region_with_cycle
+// FWDBWD-LABEL: graph_region_with_cycle
+func.func @graph_region_with_cycle() {
+  test.isolated_graph_region {
+    // FWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 forward static slice:
+    // FWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 
+    // FWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 forward static slice:
+    // FWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 
+    
+    // BWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 backward static slice:
+    // BWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 
+    // BWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 backward static slice:
+    // BWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 
+    %0 = "slicing-test-op"(%1) : (i1) -> i1
+    %1 = "slicing-test-op"(%0) : (i1) -> i1
+  }
+
+  return
+}
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index 3d5a46d13e59..cf570beb14b9 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -3363,3 +3363,18 @@ func.func @bf16_fma(%arg0: vector<32x32x32xbf16>, %arg1: vector<32x32x32xbf16>,
     }
   }
 #-}
+
+// CHECK-LABEL: func @unreachable()
+// CHECK-NEXT: return
+// CHECK-NOT: arith
+func.func @unreachable() {
+  return
+^unreachable:
+  %c1_i64 = arith.constant 1 : i64
+  // This self referencing operation is legal in an unreachable block.
+  // Many patterns are unsafe with respect to this kind of situation,
+  // check that we don't infinite loop here.
+  %add = arith.addi %add, %c1_i64 : i64
+  cf.br ^unreachable
+}
+
diff --git a/mlir/test/Dialect/Mesh/simplifications.mlir b/mlir/test/Dialect/Mesh/simplifications.mlir
index 2540fbf9510c..e955f4c13425 100644
--- a/mlir/test/Dialect/Mesh/simplifications.mlir
+++ b/mlir/test/Dialect/Mesh/simplifications.mlir
@@ -165,3 +165,15 @@ func.func @all_reduce_arith_minsi_endomorphism(
   // CHECK: return %[[ALL_REDUCE_RES]]
   return %2 : tensor<5xi32>
 }
+
+// Ensure this case without endomorphism op not crash.
+// CHECK-LABEL: func.func @no_endomorphism_op
+func.func @no_endomorphism_op(%arg0: tensor<2xi64>) -> i64 {
+  %c0 = arith.constant 0 : index
+  %c1_i64 = arith.constant 1 : i64
+  // CHECK: tensor.extract
+  %extracted = tensor.extract %arg0[%c0] : tensor<2xi64>
+  // CHECK: arith.maxsi
+  %0 = arith.maxsi %extracted, %c1_i64 : i64
+  return %0 : i64
+}
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 8ba8013d008a..b15fabdd29c6 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1925,3 +1925,16 @@ func.func @index_switch_fold_no_res() {
 
 // CHECK-LABEL: func.func @index_switch_fold_no_res()
 //  CHECK-NEXT: "test.op"() : () -> ()
+
+// -----
+
+// CHECK-LABEL: func @scf_for_all_step_size_0()
+//       CHECK:   scf.forall (%{{.*}}) = (0) to (1) step (0)
+func.func @scf_for_all_step_size_0()  {
+  %x = arith.constant 0 : index
+  scf.forall (%i, %j) = (0, 4) to (1, 5) step (%x, 8) {
+    vector.print %x : index
+    scf.forall.in_parallel {}
+  }
+  return
+}
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index 12187dd18012..ea2343efd246 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -1379,6 +1379,21 @@ func.func @extract_strided_broadcast4(%arg0: f32) -> vector<1x4xf32> {
 
 // -----
 
+// Check the case where the same dimension is both broadcasted and sliced 
+// CHECK-LABEL: func @extract_strided_broadcast5
+//  CHECK-SAME: (%[[ARG:.+]]: vector<2x1xf32>)
+//       CHECK: %[[V:.+]] = vector.broadcast %[[ARG]] : vector<2x1xf32> to vector<2x4xf32>
+//       CHECK: return %[[V]]
+func.func @extract_strided_broadcast5(%arg0: vector<2x1xf32>) -> vector<2x4xf32> {
+ %0 = vector.broadcast %arg0 : vector<2x1xf32> to vector<2x8xf32>
+ %1 = vector.extract_strided_slice %0
+      {offsets = [0, 4], sizes = [2, 4], strides = [1, 1]}
+      : vector<2x8xf32> to vector<2x4xf32>
+  return %1 : vector<2x4xf32>
+}
+
+// -----
+
 // CHECK-LABEL: consecutive_shape_cast
 //       CHECK:   %[[C:.*]] = vector.shape_cast %{{.*}} : vector<16xf16> to vector<4x4xf16>
 //  CHECK-NEXT:   return %[[C]] : vector<4x4xf16>
diff --git a/mlir/test/IR/recursive-distinct-attr.mlir b/mlir/test/IR/recursive-distinct-attr.mlir
new file mode 100644
index 000000000000..5afb5c59e0fc
--- /dev/null
+++ b/mlir/test/IR/recursive-distinct-attr.mlir
@@ -0,0 +1,13 @@
+// RUN: mlir-opt -emit-bytecode %s | mlir-opt --mlir-print-debuginfo | FileCheck %s
+
+// Verify that the distinct attribute which is used transitively
+// through two aliases does not end up duplicated when round-tripped
+// through bytecode.
+
+// CHECK: distinct[0]
+// CHECK-NOT: distinct[1]
+#attr_ugly = #test<attr_ugly begin distinct[0]<> end>
+#attr_ugly1 = #test<attr_ugly begin #attr_ugly end>
+
+module attributes {test.alias = #attr_ugly, test.alias1 = #attr_ugly1} {
+}
\ No newline at end of file
diff --git a/mlir/test/IR/test-clone.mlir b/mlir/test/IR/test-clone.mlir
index 0c07593aef32..f723efc1a2c5 100644
--- a/mlir/test/IR/test-clone.mlir
+++ b/mlir/test/IR/test-clone.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s -pass-pipeline="builtin.module(func.func(test-clone))" --split-input-file | FileCheck %s
 
 module {
   func.func @fixpoint(%arg1 : i32) -> i32 {
@@ -18,7 +18,8 @@ module {
 // CHECK-NEXT: notifyOperationInserted: test.yield
 // CHECK-NEXT: notifyOperationInserted: func.return
 
-// CHECK:   func @fixpoint(%[[arg0:.+]]: i32) -> i32 {
+// CHECK-LABEL: func @fixpoint
+// CHECK-SAME:       (%[[arg0:.+]]: i32) -> i32 {
 // CHECK-NEXT:     %[[i0:.+]] = "test.use"(%[[arg0]]) ({
 // CHECK-NEXT:       %[[r2:.+]] = "test.use2"(%[[arg0]]) ({
 // CHECK-NEXT:         "test.yield2"(%[[arg0]]) : (i32) -> ()
@@ -33,3 +34,33 @@ module {
 // CHECK-NEXT:     }) : (i32) -> i32
 // CHECK-NEXT:     return %[[i1]] : i32
 // CHECK-NEXT:   }
+
+// -----
+
+func.func @clone_unregistered_with_attrs() {
+  "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = 3.14 : f32}> : () -> ()
+  "unregistered.bar"() : () -> ()
+  "unregistered.empty_dict"() <{}> : () -> ()
+  "unregistered.complex"() <{
+    array = [1, 2, 3],
+    dict = {key1 = 42 : i32, key2 = "value"},
+    nested = {inner = {deep = 100 : i64}}
+  }> : () -> ()
+  return
+}
+
+// CHECK: notifyOperationInserted: unregistered.foo
+// CHECK-NEXT: notifyOperationInserted: unregistered.bar
+// CHECK-NEXT: notifyOperationInserted: unregistered.empty_dict
+// CHECK-NEXT: notifyOperationInserted: unregistered.complex
+// CHECK-NEXT: notifyOperationInserted: func.return
+
+// CHECK-LABEL:  func @clone_unregistered_with_attrs() {
+// CHECK-NEXT:     "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = [[PI:.+]] : f32}> : () -> ()
+// CHECK-NEXT:     "unregistered.bar"() : () -> ()
+// CHECK-NEXT:     "unregistered.empty_dict"() <{}> : () -> ()
+// CHECK-NEXT:     "unregistered.complex"() <{array = [1, 2, 3], dict = {key1 = 42 : i32, key2 = "value"}, nested = {inner = {deep = 100 : i64}}}> : () -> ()
+// CHECK-NEXT:     "unregistered.foo"() <{bar = 1 : i64, flag = true, name = "test", value = [[PI]] : f32}> : () -> ()
+// CHECK-NEXT:     "unregistered.bar"() : () -> ()
+// CHECK-NEXT:     "unregistered.empty_dict"() <{}> : () -> ()
+// CHECK-NEXT:     "unregistered.complex"() <{array = [1, 2, 3], dict = {key1 = 42 : i32, key2 = "value"}, nested = {inner = {deep = 100 : i64}}}> : () -> ()
diff --git a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
index 02f7e60671c9..c3063416b036 100644
--- a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
+++ b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir
@@ -40,12 +40,12 @@ func.func @move_before(%cond : i1) {
 }
 
 // Check that the driver handles rewriter.moveAfter. In this case, we expect
-// the moved op to be visited only once since walk uses `make_early_inc_range`.
+// the moved op to be visited twice.
 // CHECK-LABEL: func.func @move_after(
 // CHECK: scf.if
 // CHECK: }
 // CHECK: "test.move_after_parent_op"
-// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> ()
+// CHECK: "test.any_attr_of_i32_str"() <{attr = 2 : i32}> : () -> ()
 // CHECK: return
 func.func @move_after(%cond : i1) {
   scf.if %cond {
@@ -119,3 +119,23 @@ func.func @erase_nested_block() -> i32 {
   }): () -> (i32)
   return %a : i32
 }
+
+
+// CHECK-LABEL: func.func @unreachable_replace_with_new_op
+// CHECK: "test.new_op"
+// CHECK: "test.replace_with_new_op"
+// CHECK-SAME: unreachable
+// CHECK: "test.new_op"
+func.func @unreachable_replace_with_new_op() {
+  "test.br"()[^bb1] : () -> ()
+^bb1:
+  %a = "test.replace_with_new_op"() : () -> (i32)
+  "test.br"()[^end] : () -> () // Test jumping over the unreachable block is visited as well.
+^unreachable:
+  %b = "test.replace_with_new_op"() {test.unreachable} : () -> (i32)
+  return
+^end:
+  %c = "test.replace_with_new_op"() : () -> (i32)
+  return
+}
+
diff --git a/mlir/test/Pass/pipeline-options-parsing.mlir b/mlir/test/Pass/pipeline-options-parsing.mlir
index 9385d353faf9..03ac38ea1611 100644
--- a/mlir/test/Pass/pipeline-options-parsing.mlir
+++ b/mlir/test/Pass/pipeline-options-parsing.mlir
@@ -13,6 +13,7 @@
 // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(builtin.module(func.func(test-options-pass{list=3}), func.func(test-options-pass{enum=one list=1,2,3,4 string=foo"bar"baz})))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_6 %s
 // RUN: mlir-opt %s -verify-each=false '-test-options-super-pass-pipeline=super-list={{enum=zero list=1 string=foo},{enum=one list=2 string="bar"},{enum=two list=3 string={baz}}}' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
 // RUN: mlir-opt %s -verify-each=false -pass-pipeline='builtin.module(func.func(test-options-super-pass{list={{enum=zero list={1} string=foo },{enum=one list={2} string=bar },{enum=two list={3} string=baz }}}))' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_7 %s
+// RUN: mlir-opt %s -verify-each=false -test-options-super-set-ab-pipeline='foo=true bar=false' -dump-pass-pipeline 2>&1 | FileCheck --check-prefix=CHECK_11 %s
 
 
 // This test checks that lists-of-nested-options like 'option1={...},{....}' can be parsed
@@ -106,3 +107,12 @@
 // CHECK_10-NEXT:     test-options-pass{enum=zero  string= string-list={,}}
 // CHECK_10-NEXT:   )
 // CHECK_10-NEXT: )
+
+// CHECK_11:      builtin.module(
+// CHECK_11-NEXT:   func.func(
+// CHECK_11-NEXT:     test-options-pass-a
+// CHECK_11-NEXT:   )
+// CHECK_11-NEXT:   func.func(
+// CHECK_11-NEXT:     test-options-pass-b
+// CHECK_11-NEXT:   )
+// CHECK_11-NEXT: )
diff --git a/mlir/test/Transforms/remove-dead-values.mlir b/mlir/test/Transforms/remove-dead-values.mlir
index 3af95db3c0e2..0f8d757086e8 100644
--- a/mlir/test/Transforms/remove-dead-values.mlir
+++ b/mlir/test/Transforms/remove-dead-values.mlir
@@ -548,3 +548,47 @@ func.func @test_atomic_yield(%I: memref<10xf32>, %idx : index) {
   func.return
 }
 
+// -----
+
+// CHECK-LABEL: module @return_void_with_unused_argument
+module @return_void_with_unused_argument {
+  // CHECK-LABEL: func.func private @fn_return_void_with_unused_argument
+  // CHECK-SAME: (%[[ARG0_FN:.*]]: i32)
+  func.func private @fn_return_void_with_unused_argument(%arg0: i32, %arg1: memref<4xi32>) -> () {
+    %sum = arith.addi %arg0, %arg0 : i32
+    %c0 = arith.constant 0 : index
+    %buf = memref.alloc() : memref<1xi32>
+    memref.store %sum, %buf[%c0] : memref<1xi32>
+    return
+  }
+  // CHECK-LABEL: func.func @main
+  // CHECK-SAME: (%[[ARG0_MAIN:.*]]: i32)
+  // CHECK: call @fn_return_void_with_unused_argument(%[[ARG0_MAIN]]) : (i32) -> ()
+  func.func @main(%arg0: i32) -> memref<4xi32> {
+    %unused = memref.alloc() : memref<4xi32>
+    call @fn_return_void_with_unused_argument(%arg0, %unused) : (i32, memref<4xi32>) -> ()
+    return %unused : memref<4xi32>
+  }
+}
+
+// -----
+
+// CHECK-LABEL: module @dynamically_unreachable
+module @dynamically_unreachable {
+  func.func @dynamically_unreachable() {
+    // This value is used by an operation in a dynamically unreachable block.
+    %zero = arith.constant 0 : i64
+
+    // Dataflow analysis knows from the constant condition that
+    // ^bb1 is unreachable
+    %false = arith.constant false
+    cf.cond_br %false, ^bb1, ^bb4
+  ^bb1:
+    // This unreachable operation should be removed.
+    // CHECK-NOT: arith.cmpi
+    %3 = arith.cmpi eq, %zero, %zero : i64
+    cf.br ^bb1
+  ^bb4:
+    return
+  }
+}
diff --git a/mlir/test/Transforms/test-canonicalize.mlir b/mlir/test/Transforms/test-canonicalize.mlir
index 0fc822b0a23a..8cad6b98441d 100644
--- a/mlir/test/Transforms/test-canonicalize.mlir
+++ b/mlir/test/Transforms/test-canonicalize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s
+// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s  --check-prefixes=CHECK,RS
 // RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize{region-simplify=disabled}))' | FileCheck %s --check-prefixes=CHECK,NO-RS
 
 // CHECK-LABEL: func @remove_op_with_inner_ops_pattern
@@ -80,12 +80,10 @@ func.func @test_dialect_canonicalizer() -> (i32) {
 
 // Check that the option to control region simplification actually works
 // CHECK-LABEL: test_region_simplify
-func.func @test_region_simplify() {
-  // CHECK-NEXT:   return
-  // NO-RS-NEXT: ^bb1
-  // NO-RS-NEXT:   return
-  // CHECK-NEXT: }
-  return
-^bb1:
-  return
+func.func @test_region_simplify(%input1 : i32, %cond : i1) -> i32 {
+  // RS-NEXT: "test.br"(%arg0)[^bb1] : (i32) -> ()
+  // NO-RS-NEXT: "test.br"(%arg0, %arg0)[^bb1] : (i32, i32) -> ()
+   "test.br"(%input1, %input1)[^bb1] : (i32, i32) -> ()
+^bb1(%used_arg : i32, %unused_arg : i32):
+  return %used_arg : i32
 }
diff --git a/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp b/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
index 43005e22584c..8e2f03b644e4 100644
--- a/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
+++ b/mlir/test/lib/Analysis/DataFlow/TestLivenessAnalysis.cpp
@@ -33,7 +33,6 @@ struct TestLivenessAnalysisPass
 
   void runOnOperation() override {
     auto &livenessAnalysis = getAnalysis<RunLivenessAnalysis>();
-
     Operation *op = getOperation();
 
     raw_ostream &os = llvm::outs();
diff --git a/mlir/test/lib/Pass/TestPassManager.cpp b/mlir/test/lib/Pass/TestPassManager.cpp
index 7afe2109f04d..2b5f75ef53f1 100644
--- a/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/mlir/test/lib/Pass/TestPassManager.cpp
@@ -133,6 +133,51 @@ struct TestOptionsSuperPass
       llvm::cl::desc("Example list of PassPipelineOptions option")};
 };
 
+struct TestOptionsPassA
+    : public PassWrapper<TestOptionsPassA, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassA)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> foo{*this, "foo", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassA() = default;
+  TestOptionsPassA(const TestOptionsPassA &) : PassWrapper() {}
+  TestOptionsPassA(const Options &options) { this->options.foo = options.foo; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-a"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset A";
+  }
+
+  Options options;
+};
+
+struct TestOptionsPassB
+    : public PassWrapper<TestOptionsPassB, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestOptionsPassB)
+
+  struct Options : public PassPipelineOptions<Options> {
+    Option<bool> bar{*this, "bar", llvm::cl::desc("Example boolean option")};
+  };
+
+  TestOptionsPassB() = default;
+  TestOptionsPassB(const TestOptionsPassB &) : PassWrapper() {}
+  TestOptionsPassB(const Options &options) { this->options.bar = options.bar; }
+
+  void runOnOperation() final {}
+  StringRef getArgument() const final { return "test-options-pass-b"; }
+  StringRef getDescription() const final {
+    return "Test superset options parsing capabilities - subset B";
+  }
+
+  Options options;
+};
+
+struct TestPipelineOptionsSuperSetAB : TestOptionsPassA::Options,
+                                       TestOptionsPassB::Options {};
+
 /// A test pass that always aborts to enable testing the crash recovery
 /// mechanism of the pass manager.
 struct TestCrashRecoveryPass
@@ -270,6 +315,9 @@ void registerPassManagerTestPass() {
   PassRegistration<TestOptionsPass>();
   PassRegistration<TestOptionsSuperPass>();
 
+  PassRegistration<TestOptionsPassA>();
+  PassRegistration<TestOptionsPassB>();
+
   PassRegistration<TestModulePass>();
 
   PassRegistration<TestFunctionPass>();
@@ -306,5 +354,16 @@ void registerPassManagerTestPass() {
           [](OpPassManager &pm, const TestOptionsSuperPass::Options &options) {
             pm.addPass(std::make_unique<TestOptionsSuperPass>(options));
           });
+
+  PassPipelineRegistration<TestPipelineOptionsSuperSetAB>
+      registerPipelineOptionsSuperSetABPipeline(
+          "test-options-super-set-ab-pipeline",
+          "Parses options of PassPipelineOptions using pass pipeline "
+          "registration",
+          [](OpPassManager &pm, const TestPipelineOptionsSuperSetAB &options) {
+            // Pass superset AB options to subset options A and B
+            pm.addPass(std::make_unique<TestOptionsPassA>(options));
+            pm.addPass(std::make_unique<TestOptionsPassB>(options));
+          });
 }
 } // namespace mlir
diff --git a/mlir/test/mlir-tblgen/enums-python-bindings.td b/mlir/test/mlir-tblgen/enums-python-bindings.td
index 1c5567f54a5f..cd23b6a2effb 100644
--- a/mlir/test/mlir-tblgen/enums-python-bindings.td
+++ b/mlir/test/mlir-tblgen/enums-python-bindings.td
@@ -62,12 +62,15 @@ def MyEnum64 : I64EnumAttr<"MyEnum64", "An example 64-bit enum", [One64, Two64]>
 // CHECK: def _myenum64(x, context):
 // CHECK:     return _ods_ir.IntegerAttr.get(_ods_ir.IntegerType.get_signless(64, context=context), int(x))
 
+def User : I32BitEnumAttrCaseBit<"User", 0, "user">;
+def Group : I32BitEnumAttrCaseBit<"Group", 1, "group">;
+def Other : I32BitEnumAttrCaseBit<"Other", 2, "other">;
+
 def TestBitEnum
-    : I32BitEnumAttr<"TestBitEnum", "", [
-        I32BitEnumAttrCaseBit<"User", 0, "user">,
-        I32BitEnumAttrCaseBit<"Group", 1, "group">,
-        I32BitEnumAttrCaseBit<"Other", 2, "other">,
-      ]> {
+    : I32BitEnumAttr<
+          "TestBitEnum", "",
+          [User, Group, Other,
+           I32BitEnumAttrCaseGroup<"Any", [User, Group, Other], "any">]> {
   let genSpecializedAttr = 0;
   let separator = " | ";
 }
@@ -79,9 +82,10 @@ def TestBitEnum_Attr : EnumAttr<Test_Dialect, TestBitEnum, "testbitenum">;
 // CHECK:     User = 1
 // CHECK:     Group = 2
 // CHECK:     Other = 4
+// CHECK:     Any = 7
 
 // CHECK:     def __iter__(self):
-// CHECK:         return iter([case for case in type(self) if (self & case) is case])
+// CHECK:         return iter([case for case in type(self) if (self & case) is case and self is not case])
 // CHECK:     def __len__(self):
 // CHECK:         return bin(self).count("1")
 
@@ -94,6 +98,8 @@ def TestBitEnum_Attr : EnumAttr<Test_Dialect, TestBitEnum, "testbitenum">;
 // CHECK:             return "group"
 // CHECK:         if self is TestBitEnum.Other:
 // CHECK:             return "other"
+// CHECK:         if self is TestBitEnum.Any:
+// CHECK:             return "any"
 // CHECK:         raise ValueError("Unknown TestBitEnum enum entry.")
 
 // CHECK: @register_attribute_builder("TestBitEnum")
diff --git a/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
index 8e2d6114e48e..acc9b61d7121 100644
--- a/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
+++ b/mlir/tools/mlir-tblgen/EnumPythonBindingGen.cpp
@@ -64,7 +64,7 @@ static void emitEnumClass(EnumInfo enumInfo, raw_ostream &os) {
   if (enumInfo.isBitEnum()) {
     os << formatv("    def __iter__(self):\n"
                   "        return iter([case for case in type(self) if "
-                  "(self & case) is case])\n");
+                  "(self & case) is case and self is not case])\n");
     os << formatv("    def __len__(self):\n"
                   "        return bin(self).count(\"1\")\n");
     os << "\n";
diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp
index 887db227cfc4..312b10f28143 100644
--- a/mlir/unittests/ExecutionEngine/Invoke.cpp
+++ b/mlir/unittests/ExecutionEngine/Invoke.cpp
@@ -205,7 +205,13 @@ TEST(NativeMemRefJit, SKIP_WITHOUT_JIT(BasicMemref)) {
   };
   int64_t shape[] = {k, m};
   int64_t shapeAlloc[] = {k + 1, m + 1};
-  OwningMemRef<float, 2> a(shape, shapeAlloc, init);
+  // Use a large alignment to stress the case where the memref data/basePtr are
+  // disjoint.
+  int alignment = 8192;
+  OwningMemRef<float, 2> a(shape, shapeAlloc, init, alignment);
+  ASSERT_EQ(
+      (void *)(((uintptr_t)a->basePtr + alignment - 1) & ~(alignment - 1)),
+      a->data);
   ASSERT_EQ(a->sizes[0], k);
   ASSERT_EQ(a->sizes[1], m);
   ASSERT_EQ(a->strides[0], m + 1);
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index d22afb3003e7..a46e64718dab 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_unittest(MLIRIRTests
   AttrTypeReplacerTest.cpp
   Diagnostic.cpp
   DialectTest.cpp
+  DistinctAttributeAllocatorTest.cpp
   InterfaceTest.cpp
   IRMapping.cpp
   InterfaceAttachmentTest.cpp
diff --git a/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp
new file mode 100644
index 000000000000..99067d09f7be
--- /dev/null
+++ b/mlir/unittests/IR/DistinctAttributeAllocatorTest.cpp
@@ -0,0 +1,45 @@
+//=== DistinctAttributeAllocatorTest.cpp - DistinctAttr storage alloc test ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "llvm/Support/CrashRecoveryContext.h"
+#include <thread>
+
+using namespace mlir;
+
+//
+// Test that a DistinctAttr that is created on a separate thread does
+// not have its storage deleted when the thread joins.
+//
+TEST(DistinctAttributeAllocatorTest, TestAttributeWellFormedAfterThreadJoin) {
+  MLIRContext ctx;
+  OpBuilder builder(&ctx);
+  DistinctAttr attr;
+
+  std::thread t([&ctx, &attr]() {
+    attr = DistinctAttr::create(UnitAttr::get(&ctx));
+    ASSERT_TRUE(attr);
+  });
+  t.join();
+
+  // If the attribute storage got deleted after the thread joins (which we don't
+  // want) then trying to access it triggers an assert in Debug mode, and a
+  // crash otherwise. Run this in a CrashRecoveryContext to avoid bringing down
+  // the whole test suite if this test fails. Additionally, MSAN and/or TSAN
+  // should raise failures here if the attribute storage was deleted.
+  llvm::CrashRecoveryContext crc;
+  EXPECT_TRUE(crc.RunSafely([attr]() { (void)attr.getAbstractAttribute(); }));
+  EXPECT_TRUE(
+      crc.RunSafely([attr]() { (void)*cast<Attribute>(attr).getImpl(); }));
+
+  ASSERT_TRUE(attr);
+}